langfuse

Langfuse Python SDK
Installation
The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.
pip install langfuse
Docs
Please see our docs for detailed information on this SDK.
1""".. include:: ../README.md""" 2 3from langfuse.batch_evaluation import ( 4 BatchEvaluationResult, 5 BatchEvaluationResumeToken, 6 CompositeEvaluatorFunction, 7 EvaluatorInputs, 8 EvaluatorStats, 9 MapperFunction, 10) 11from langfuse.experiment import Evaluation 12 13from ._client import client as _client_module 14from ._client.attributes import LangfuseOtelSpanAttributes 15from ._client.constants import ObservationTypeLiteral 16from ._client.get_client import get_client 17from ._client.observe import observe 18from ._client.propagation import propagate_attributes 19from ._client.span import ( 20 LangfuseAgent, 21 LangfuseChain, 22 LangfuseEmbedding, 23 LangfuseEvaluator, 24 LangfuseEvent, 25 LangfuseGeneration, 26 LangfuseGuardrail, 27 LangfuseRetriever, 28 LangfuseSpan, 29 LangfuseTool, 30) 31from .span_filter import ( 32 KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES, 33 is_default_export_span, 34 is_genai_span, 35 is_known_llm_instrumentor, 36 is_langfuse_span, 37) 38 39Langfuse = _client_module.Langfuse 40 41__all__ = [ 42 "Langfuse", 43 "get_client", 44 "observe", 45 "propagate_attributes", 46 "ObservationTypeLiteral", 47 "LangfuseSpan", 48 "LangfuseGeneration", 49 "LangfuseEvent", 50 "LangfuseOtelSpanAttributes", 51 "LangfuseAgent", 52 "LangfuseTool", 53 "LangfuseChain", 54 "LangfuseEmbedding", 55 "LangfuseEvaluator", 56 "LangfuseRetriever", 57 "LangfuseGuardrail", 58 "Evaluation", 59 "EvaluatorInputs", 60 "MapperFunction", 61 "CompositeEvaluatorFunction", 62 "EvaluatorStats", 63 "BatchEvaluationResumeToken", 64 "BatchEvaluationResult", 65 "is_default_export_span", 66 "is_langfuse_span", 67 "is_genai_span", 68 "is_known_llm_instrumentor", 69 "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES", 70 "experiment", 71 "api", 72]
132class Langfuse: 133 """Main client for Langfuse tracing and platform features. 134 135 This class provides an interface for creating and managing traces, spans, 136 and generations in Langfuse as well as interacting with the Langfuse API. 137 138 The client features a thread-safe singleton pattern for each unique public API key, 139 ensuring consistent trace context propagation across your application. It implements 140 efficient batching of spans with configurable flush settings and includes background 141 thread management for media uploads and score ingestion. 142 143 Configuration is flexible through either direct parameters or environment variables, 144 with graceful fallbacks and runtime configuration updates. 145 146 Attributes: 147 api: Synchronous API client for Langfuse backend communication 148 async_api: Asynchronous API client for Langfuse backend communication 149 _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components 150 151 Parameters: 152 public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable. 153 secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable. 154 base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable. 155 host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com". 156 timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds. 157 httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created. 158 debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable. 159 tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable. 160 flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable. 161 flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable. 162 environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'. 163 release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release. 164 media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable. 165 sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. 166 mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API. 167 blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior: 168 ```python 169 from langfuse.span_filter import is_default_export_span 170 blocked = {"sqlite", "requests"} 171 172 should_export_span = lambda span: ( 173 is_default_export_span(span) 174 and ( 175 span.instrumentation_scope is None 176 or span.instrumentation_scope.name not in blocked 177 ) 178 ) 179 ``` 180 should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes). 181 additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. 182 tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees. 183 184 Example: 185 ```python 186 from langfuse.otel import Langfuse 187 188 # Initialize the client (reads from env vars if not provided) 189 langfuse = Langfuse( 190 public_key="your-public-key", 191 secret_key="your-secret-key", 192 host="https://cloud.langfuse.com", # Optional, default shown 193 ) 194 195 # Create a trace span 196 with langfuse.start_as_current_observation(name="process-query") as span: 197 # Your application code here 198 199 # Create a nested generation span for an LLM call 200 with span.start_as_current_generation( 201 name="generate-response", 202 model="gpt-4", 203 input={"query": "Tell me about AI"}, 204 model_parameters={"temperature": 0.7, "max_tokens": 500} 205 ) as generation: 206 # Generate response here 207 response = "AI is a field of computer science..." 208 209 generation.update( 210 output=response, 211 usage_details={"prompt_tokens": 10, "completion_tokens": 50}, 212 cost_details={"total_cost": 0.0023} 213 ) 214 215 # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL) 216 generation.score(name="relevance", value=0.95, data_type="NUMERIC") 217 ``` 218 """ 219 220 _resources: Optional[LangfuseResourceManager] = None 221 _mask: Optional[MaskFunction] = None 222 _otel_tracer: otel_trace_api.Tracer 223 224 def __init__( 225 self, 226 *, 227 public_key: Optional[str] = None, 228 secret_key: Optional[str] = None, 229 base_url: Optional[str] = None, 230 host: Optional[str] = None, 231 timeout: Optional[int] = None, 232 httpx_client: Optional[httpx.Client] = None, 233 debug: bool = False, 234 tracing_enabled: Optional[bool] = True, 235 flush_at: Optional[int] = None, 236 flush_interval: Optional[float] = None, 237 environment: Optional[str] = None, 238 release: Optional[str] = None, 239 media_upload_thread_count: Optional[int] = None, 240 sample_rate: Optional[float] = None, 241 mask: Optional[MaskFunction] = None, 242 blocked_instrumentation_scopes: Optional[List[str]] = None, 243 should_export_span: Optional[Callable[[ReadableSpan], bool]] = None, 244 additional_headers: Optional[Dict[str, str]] = None, 245 tracer_provider: Optional[TracerProvider] = None, 246 ): 247 self._base_url = ( 248 base_url 249 or os.environ.get(LANGFUSE_BASE_URL) 250 or host 251 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 252 ) 253 self._environment = environment or cast( 254 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 255 ) 256 self._project_id: Optional[str] = None 257 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 258 if not 0.0 <= sample_rate <= 1.0: 259 raise ValueError( 260 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 261 ) 262 263 timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 264 265 self._tracing_enabled = ( 266 tracing_enabled 267 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 268 ) 269 if not self._tracing_enabled: 270 langfuse_logger.info( 271 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 272 ) 273 274 debug = ( 275 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 276 ) 277 if debug: 278 logging.basicConfig( 279 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 280 ) 281 langfuse_logger.setLevel(logging.DEBUG) 282 283 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 284 if public_key is None: 285 langfuse_logger.warning( 286 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 287 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 288 ) 289 self._otel_tracer = otel_trace_api.NoOpTracer() 290 return 291 292 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 293 if secret_key is None: 294 langfuse_logger.warning( 295 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 296 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 297 ) 298 self._otel_tracer = otel_trace_api.NoOpTracer() 299 return 300 301 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 302 langfuse_logger.warning( 303 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 304 ) 305 306 if blocked_instrumentation_scopes is not None: 307 warnings.warn( 308 "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. " 309 "Use `should_export_span` instead. Example: " 310 "from langfuse.span_filter import is_default_export_span; " 311 'blocked={"scope"}; should_export_span=lambda span: ' 312 "is_default_export_span(span) and (span.instrumentation_scope is None or " 313 "span.instrumentation_scope.name not in blocked).", 314 DeprecationWarning, 315 stacklevel=2, 316 ) 317 318 # Initialize api and tracer if requirements are met 319 self._resources = LangfuseResourceManager( 320 public_key=public_key, 321 secret_key=secret_key, 322 base_url=self._base_url, 323 timeout=timeout, 324 environment=self._environment, 325 release=release, 326 flush_at=flush_at, 327 flush_interval=flush_interval, 328 httpx_client=httpx_client, 329 media_upload_thread_count=media_upload_thread_count, 330 sample_rate=sample_rate, 331 mask=mask, 332 tracing_enabled=self._tracing_enabled, 333 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 334 should_export_span=should_export_span, 335 additional_headers=additional_headers, 336 tracer_provider=tracer_provider, 337 ) 338 self._mask = self._resources.mask 339 340 self._otel_tracer = ( 341 self._resources.tracer 342 if self._tracing_enabled and self._resources.tracer is not None 343 else otel_trace_api.NoOpTracer() 344 ) 345 self.api = self._resources.api 346 self.async_api = self._resources.async_api 347 348 @overload 349 def start_observation( 350 self, 351 *, 352 trace_context: Optional[TraceContext] = None, 353 name: str, 354 as_type: Literal["generation"], 355 input: Optional[Any] = None, 356 output: Optional[Any] = None, 357 metadata: Optional[Any] = None, 358 version: Optional[str] = None, 359 level: Optional[SpanLevel] = None, 360 status_message: Optional[str] = None, 361 completion_start_time: Optional[datetime] = None, 362 model: Optional[str] = None, 363 model_parameters: Optional[Dict[str, MapValue]] = None, 364 usage_details: Optional[Dict[str, int]] = None, 365 cost_details: Optional[Dict[str, float]] = None, 366 prompt: Optional[PromptClient] = None, 367 ) -> LangfuseGeneration: ... 368 369 @overload 370 def start_observation( 371 self, 372 *, 373 trace_context: Optional[TraceContext] = None, 374 name: str, 375 as_type: Literal["span"] = "span", 376 input: Optional[Any] = None, 377 output: Optional[Any] = None, 378 metadata: Optional[Any] = None, 379 version: Optional[str] = None, 380 level: Optional[SpanLevel] = None, 381 status_message: Optional[str] = None, 382 ) -> LangfuseSpan: ... 383 384 @overload 385 def start_observation( 386 self, 387 *, 388 trace_context: Optional[TraceContext] = None, 389 name: str, 390 as_type: Literal["agent"], 391 input: Optional[Any] = None, 392 output: Optional[Any] = None, 393 metadata: Optional[Any] = None, 394 version: Optional[str] = None, 395 level: Optional[SpanLevel] = None, 396 status_message: Optional[str] = None, 397 ) -> LangfuseAgent: ... 398 399 @overload 400 def start_observation( 401 self, 402 *, 403 trace_context: Optional[TraceContext] = None, 404 name: str, 405 as_type: Literal["tool"], 406 input: Optional[Any] = None, 407 output: Optional[Any] = None, 408 metadata: Optional[Any] = None, 409 version: Optional[str] = None, 410 level: Optional[SpanLevel] = None, 411 status_message: Optional[str] = None, 412 ) -> LangfuseTool: ... 413 414 @overload 415 def start_observation( 416 self, 417 *, 418 trace_context: Optional[TraceContext] = None, 419 name: str, 420 as_type: Literal["chain"], 421 input: Optional[Any] = None, 422 output: Optional[Any] = None, 423 metadata: Optional[Any] = None, 424 version: Optional[str] = None, 425 level: Optional[SpanLevel] = None, 426 status_message: Optional[str] = None, 427 ) -> LangfuseChain: ... 428 429 @overload 430 def start_observation( 431 self, 432 *, 433 trace_context: Optional[TraceContext] = None, 434 name: str, 435 as_type: Literal["retriever"], 436 input: Optional[Any] = None, 437 output: Optional[Any] = None, 438 metadata: Optional[Any] = None, 439 version: Optional[str] = None, 440 level: Optional[SpanLevel] = None, 441 status_message: Optional[str] = None, 442 ) -> LangfuseRetriever: ... 443 444 @overload 445 def start_observation( 446 self, 447 *, 448 trace_context: Optional[TraceContext] = None, 449 name: str, 450 as_type: Literal["evaluator"], 451 input: Optional[Any] = None, 452 output: Optional[Any] = None, 453 metadata: Optional[Any] = None, 454 version: Optional[str] = None, 455 level: Optional[SpanLevel] = None, 456 status_message: Optional[str] = None, 457 ) -> LangfuseEvaluator: ... 458 459 @overload 460 def start_observation( 461 self, 462 *, 463 trace_context: Optional[TraceContext] = None, 464 name: str, 465 as_type: Literal["embedding"], 466 input: Optional[Any] = None, 467 output: Optional[Any] = None, 468 metadata: Optional[Any] = None, 469 version: Optional[str] = None, 470 level: Optional[SpanLevel] = None, 471 status_message: Optional[str] = None, 472 completion_start_time: Optional[datetime] = None, 473 model: Optional[str] = None, 474 model_parameters: Optional[Dict[str, MapValue]] = None, 475 usage_details: Optional[Dict[str, int]] = None, 476 cost_details: Optional[Dict[str, float]] = None, 477 prompt: Optional[PromptClient] = None, 478 ) -> LangfuseEmbedding: ... 479 480 @overload 481 def start_observation( 482 self, 483 *, 484 trace_context: Optional[TraceContext] = None, 485 name: str, 486 as_type: Literal["guardrail"], 487 input: Optional[Any] = None, 488 output: Optional[Any] = None, 489 metadata: Optional[Any] = None, 490 version: Optional[str] = None, 491 level: Optional[SpanLevel] = None, 492 status_message: Optional[str] = None, 493 ) -> LangfuseGuardrail: ... 494 495 def start_observation( 496 self, 497 *, 498 trace_context: Optional[TraceContext] = None, 499 name: str, 500 as_type: ObservationTypeLiteralNoEvent = "span", 501 input: Optional[Any] = None, 502 output: Optional[Any] = None, 503 metadata: Optional[Any] = None, 504 version: Optional[str] = None, 505 level: Optional[SpanLevel] = None, 506 status_message: Optional[str] = None, 507 completion_start_time: Optional[datetime] = None, 508 model: Optional[str] = None, 509 model_parameters: Optional[Dict[str, MapValue]] = None, 510 usage_details: Optional[Dict[str, int]] = None, 511 cost_details: Optional[Dict[str, float]] = None, 512 prompt: Optional[PromptClient] = None, 513 ) -> Union[ 514 LangfuseSpan, 515 LangfuseGeneration, 516 LangfuseAgent, 517 LangfuseTool, 518 LangfuseChain, 519 LangfuseRetriever, 520 LangfuseEvaluator, 521 LangfuseEmbedding, 522 LangfuseGuardrail, 523 ]: 524 """Create a new observation of the specified type. 525 526 This method creates a new observation but does not set it as the current span in the 527 context. To create and use an observation within a context, use start_as_current_observation(). 528 529 Args: 530 trace_context: Optional context for connecting to an existing trace 531 name: Name of the observation 532 as_type: Type of observation to create (defaults to "span") 533 input: Input data for the operation 534 output: Output data from the operation 535 metadata: Additional metadata to associate with the observation 536 version: Version identifier for the code or component 537 level: Importance level of the observation 538 status_message: Optional status message for the observation 539 completion_start_time: When the model started generating (for generation types) 540 model: Name/identifier of the AI model used (for generation types) 541 model_parameters: Parameters used for the model (for generation types) 542 usage_details: Token usage information (for generation types) 543 cost_details: Cost information (for generation types) 544 prompt: Associated prompt template (for generation types) 545 546 Returns: 547 An observation object of the appropriate type that must be ended with .end() 548 """ 549 if trace_context: 550 trace_id = trace_context.get("trace_id", None) 551 parent_span_id = trace_context.get("parent_span_id", None) 552 553 if trace_id: 554 remote_parent_span = self._create_remote_parent_span( 555 trace_id=trace_id, parent_span_id=parent_span_id 556 ) 557 558 with otel_trace_api.use_span( 559 cast(otel_trace_api.Span, remote_parent_span) 560 ): 561 otel_span = self._otel_tracer.start_span(name=name) 562 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 563 564 return self._create_observation_from_otel_span( 565 otel_span=otel_span, 566 as_type=as_type, 567 input=input, 568 output=output, 569 metadata=metadata, 570 version=version, 571 level=level, 572 status_message=status_message, 573 completion_start_time=completion_start_time, 574 model=model, 575 model_parameters=model_parameters, 576 usage_details=usage_details, 577 cost_details=cost_details, 578 prompt=prompt, 579 ) 580 581 otel_span = self._otel_tracer.start_span(name=name) 582 583 return self._create_observation_from_otel_span( 584 otel_span=otel_span, 585 as_type=as_type, 586 input=input, 587 output=output, 588 metadata=metadata, 589 version=version, 590 level=level, 591 status_message=status_message, 592 completion_start_time=completion_start_time, 593 model=model, 594 model_parameters=model_parameters, 595 usage_details=usage_details, 596 cost_details=cost_details, 597 prompt=prompt, 598 ) 599 600 def _create_observation_from_otel_span( 601 self, 602 *, 603 otel_span: otel_trace_api.Span, 604 as_type: ObservationTypeLiteralNoEvent, 605 input: Optional[Any] = None, 606 output: Optional[Any] = None, 607 metadata: Optional[Any] = None, 608 version: Optional[str] = None, 609 level: Optional[SpanLevel] = None, 610 status_message: Optional[str] = None, 611 completion_start_time: Optional[datetime] = None, 612 model: Optional[str] = None, 613 model_parameters: Optional[Dict[str, MapValue]] = None, 614 usage_details: Optional[Dict[str, int]] = None, 615 cost_details: Optional[Dict[str, float]] = None, 616 prompt: Optional[PromptClient] = None, 617 ) -> Union[ 618 LangfuseSpan, 619 LangfuseGeneration, 620 LangfuseAgent, 621 LangfuseTool, 622 LangfuseChain, 623 LangfuseRetriever, 624 LangfuseEvaluator, 625 LangfuseEmbedding, 626 LangfuseGuardrail, 627 ]: 628 """Create the appropriate observation type from an OTEL span.""" 629 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 630 observation_class = self._get_span_class(as_type) 631 # Type ignore to prevent overloads of internal _get_span_class function, 632 # issue is that LangfuseEvent could be returned and that classes have diff. args 633 return observation_class( # type: ignore[return-value,call-arg] 634 otel_span=otel_span, 635 langfuse_client=self, 636 environment=self._environment, 637 input=input, 638 output=output, 639 metadata=metadata, 640 version=version, 641 level=level, 642 status_message=status_message, 643 completion_start_time=completion_start_time, 644 model=model, 645 model_parameters=model_parameters, 646 usage_details=usage_details, 647 cost_details=cost_details, 648 prompt=prompt, 649 ) 650 else: 651 # For other types (e.g. span, guardrail), create appropriate class without generation properties 652 observation_class = self._get_span_class(as_type) 653 # Type ignore to prevent overloads of internal _get_span_class function, 654 # issue is that LangfuseEvent could be returned and that classes have diff. args 655 return observation_class( # type: ignore[return-value,call-arg] 656 otel_span=otel_span, 657 langfuse_client=self, 658 environment=self._environment, 659 input=input, 660 output=output, 661 metadata=metadata, 662 version=version, 663 level=level, 664 status_message=status_message, 665 ) 666 # span._observation_type = as_type 667 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 668 # return span 669 670 @overload 671 def start_as_current_observation( 672 self, 673 *, 674 trace_context: Optional[TraceContext] = None, 675 name: str, 676 as_type: Literal["generation"], 677 input: Optional[Any] = None, 678 output: Optional[Any] = None, 679 metadata: Optional[Any] = None, 680 version: Optional[str] = None, 681 level: Optional[SpanLevel] = None, 682 status_message: Optional[str] = None, 683 completion_start_time: Optional[datetime] = None, 684 model: Optional[str] = None, 685 model_parameters: Optional[Dict[str, MapValue]] = None, 686 usage_details: Optional[Dict[str, int]] = None, 687 cost_details: Optional[Dict[str, float]] = None, 688 prompt: Optional[PromptClient] = None, 689 end_on_exit: Optional[bool] = None, 690 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 691 692 @overload 693 def start_as_current_observation( 694 self, 695 *, 696 trace_context: Optional[TraceContext] = None, 697 name: str, 698 as_type: Literal["span"] = "span", 699 input: Optional[Any] = None, 700 output: Optional[Any] = None, 701 metadata: Optional[Any] = None, 702 version: Optional[str] = None, 703 level: Optional[SpanLevel] = None, 704 status_message: Optional[str] = None, 705 end_on_exit: Optional[bool] = None, 706 ) -> _AgnosticContextManager[LangfuseSpan]: ... 707 708 @overload 709 def start_as_current_observation( 710 self, 711 *, 712 trace_context: Optional[TraceContext] = None, 713 name: str, 714 as_type: Literal["agent"], 715 input: Optional[Any] = None, 716 output: Optional[Any] = None, 717 metadata: Optional[Any] = None, 718 version: Optional[str] = None, 719 level: Optional[SpanLevel] = None, 720 status_message: Optional[str] = None, 721 end_on_exit: Optional[bool] = None, 722 ) -> _AgnosticContextManager[LangfuseAgent]: ... 723 724 @overload 725 def start_as_current_observation( 726 self, 727 *, 728 trace_context: Optional[TraceContext] = None, 729 name: str, 730 as_type: Literal["tool"], 731 input: Optional[Any] = None, 732 output: Optional[Any] = None, 733 metadata: Optional[Any] = None, 734 version: Optional[str] = None, 735 level: Optional[SpanLevel] = None, 736 status_message: Optional[str] = None, 737 end_on_exit: Optional[bool] = None, 738 ) -> _AgnosticContextManager[LangfuseTool]: ... 739 740 @overload 741 def start_as_current_observation( 742 self, 743 *, 744 trace_context: Optional[TraceContext] = None, 745 name: str, 746 as_type: Literal["chain"], 747 input: Optional[Any] = None, 748 output: Optional[Any] = None, 749 metadata: Optional[Any] = None, 750 version: Optional[str] = None, 751 level: Optional[SpanLevel] = None, 752 status_message: Optional[str] = None, 753 end_on_exit: Optional[bool] = None, 754 ) -> _AgnosticContextManager[LangfuseChain]: ... 755 756 @overload 757 def start_as_current_observation( 758 self, 759 *, 760 trace_context: Optional[TraceContext] = None, 761 name: str, 762 as_type: Literal["retriever"], 763 input: Optional[Any] = None, 764 output: Optional[Any] = None, 765 metadata: Optional[Any] = None, 766 version: Optional[str] = None, 767 level: Optional[SpanLevel] = None, 768 status_message: Optional[str] = None, 769 end_on_exit: Optional[bool] = None, 770 ) -> _AgnosticContextManager[LangfuseRetriever]: ... 771 772 @overload 773 def start_as_current_observation( 774 self, 775 *, 776 trace_context: Optional[TraceContext] = None, 777 name: str, 778 as_type: Literal["evaluator"], 779 input: Optional[Any] = None, 780 output: Optional[Any] = None, 781 metadata: Optional[Any] = None, 782 version: Optional[str] = None, 783 level: Optional[SpanLevel] = None, 784 status_message: Optional[str] = None, 785 end_on_exit: Optional[bool] = None, 786 ) -> _AgnosticContextManager[LangfuseEvaluator]: ... 787 788 @overload 789 def start_as_current_observation( 790 self, 791 *, 792 trace_context: Optional[TraceContext] = None, 793 name: str, 794 as_type: Literal["embedding"], 795 input: Optional[Any] = None, 796 output: Optional[Any] = None, 797 metadata: Optional[Any] = None, 798 version: Optional[str] = None, 799 level: Optional[SpanLevel] = None, 800 status_message: Optional[str] = None, 801 completion_start_time: Optional[datetime] = None, 802 model: Optional[str] = None, 803 model_parameters: Optional[Dict[str, MapValue]] = None, 804 usage_details: Optional[Dict[str, int]] = None, 805 cost_details: Optional[Dict[str, float]] = None, 806 prompt: Optional[PromptClient] = None, 807 end_on_exit: Optional[bool] = None, 808 ) -> _AgnosticContextManager[LangfuseEmbedding]: ... 809 810 @overload 811 def start_as_current_observation( 812 self, 813 *, 814 trace_context: Optional[TraceContext] = None, 815 name: str, 816 as_type: Literal["guardrail"], 817 input: Optional[Any] = None, 818 output: Optional[Any] = None, 819 metadata: Optional[Any] = None, 820 version: Optional[str] = None, 821 level: Optional[SpanLevel] = None, 822 status_message: Optional[str] = None, 823 end_on_exit: Optional[bool] = None, 824 ) -> _AgnosticContextManager[LangfuseGuardrail]: ... 825 826 def start_as_current_observation( 827 self, 828 *, 829 trace_context: Optional[TraceContext] = None, 830 name: str, 831 as_type: ObservationTypeLiteralNoEvent = "span", 832 input: Optional[Any] = None, 833 output: Optional[Any] = None, 834 metadata: Optional[Any] = None, 835 version: Optional[str] = None, 836 level: Optional[SpanLevel] = None, 837 status_message: Optional[str] = None, 838 completion_start_time: Optional[datetime] = None, 839 model: Optional[str] = None, 840 model_parameters: Optional[Dict[str, MapValue]] = None, 841 usage_details: Optional[Dict[str, int]] = None, 842 cost_details: Optional[Dict[str, float]] = None, 843 prompt: Optional[PromptClient] = None, 844 end_on_exit: Optional[bool] = None, 845 ) -> Union[ 846 _AgnosticContextManager[LangfuseGeneration], 847 _AgnosticContextManager[LangfuseSpan], 848 _AgnosticContextManager[LangfuseAgent], 849 _AgnosticContextManager[LangfuseTool], 850 _AgnosticContextManager[LangfuseChain], 851 _AgnosticContextManager[LangfuseRetriever], 852 _AgnosticContextManager[LangfuseEvaluator], 853 _AgnosticContextManager[LangfuseEmbedding], 854 _AgnosticContextManager[LangfuseGuardrail], 855 ]: 856 """Create a new observation and set it as the current span in a context manager. 857 858 This method creates a new observation of the specified type and sets it as the 859 current span within a context manager. Use this method with a 'with' statement to 860 automatically handle the observation lifecycle within a code block. 861 862 The created observation will be the child of the current span in the context. 863 864 Args: 865 trace_context: Optional context for connecting to an existing trace 866 name: Name of the observation (e.g., function or operation name) 867 as_type: Type of observation to create (defaults to "span") 868 input: Input data for the operation (can be any JSON-serializable object) 869 output: Output data from the operation (can be any JSON-serializable object) 870 metadata: Additional metadata to associate with the observation 871 version: Version identifier for the code or component 872 level: Importance level of the observation (info, warning, error) 873 status_message: Optional status message for the observation 874 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 875 876 The following parameters are available when as_type is: "generation" or "embedding". 877 completion_start_time: When the model started generating the response 878 model: Name/identifier of the AI model used (e.g., "gpt-4") 879 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 880 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 881 cost_details: Cost information for the model call 882 prompt: Associated prompt template from Langfuse prompt management 883 884 Returns: 885 A context manager that yields the appropriate observation type based on as_type 886 887 Example: 888 ```python 889 # Create a span 890 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 891 # Do work 892 result = process_data() 893 span.update(output=result) 894 895 # Create a child span automatically 896 with span.start_as_current_observation(name="sub-operation") as child_span: 897 # Do sub-operation work 898 child_span.update(output="sub-result") 899 900 # Create a tool observation 901 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 902 # Do tool work 903 results = search_web(query) 904 tool.update(output=results) 905 906 # Create a generation observation 907 with langfuse.start_as_current_observation( 908 name="answer-generation", 909 as_type="generation", 910 model="gpt-4" 911 ) as generation: 912 # Generate answer 913 response = llm.generate(...) 914 generation.update(output=response) 915 ``` 916 """ 917 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 918 if trace_context: 919 trace_id = trace_context.get("trace_id", None) 920 parent_span_id = trace_context.get("parent_span_id", None) 921 922 if trace_id: 923 remote_parent_span = self._create_remote_parent_span( 924 trace_id=trace_id, parent_span_id=parent_span_id 925 ) 926 927 return cast( 928 Union[ 929 _AgnosticContextManager[LangfuseGeneration], 930 _AgnosticContextManager[LangfuseEmbedding], 931 ], 932 self._create_span_with_parent_context( 933 as_type=as_type, 934 name=name, 935 remote_parent_span=remote_parent_span, 936 parent=None, 937 end_on_exit=end_on_exit, 938 input=input, 939 output=output, 940 metadata=metadata, 941 version=version, 942 level=level, 943 status_message=status_message, 944 completion_start_time=completion_start_time, 945 model=model, 946 model_parameters=model_parameters, 947 usage_details=usage_details, 948 cost_details=cost_details, 949 prompt=prompt, 950 ), 951 ) 952 953 return cast( 954 Union[ 955 _AgnosticContextManager[LangfuseGeneration], 956 _AgnosticContextManager[LangfuseEmbedding], 957 ], 958 self._start_as_current_otel_span_with_processed_media( 959 as_type=as_type, 960 name=name, 961 end_on_exit=end_on_exit, 962 input=input, 963 output=output, 964 metadata=metadata, 965 version=version, 966 level=level, 967 status_message=status_message, 968 completion_start_time=completion_start_time, 969 model=model, 970 model_parameters=model_parameters, 971 usage_details=usage_details, 972 cost_details=cost_details, 973 prompt=prompt, 974 ), 975 ) 976 977 if as_type in get_observation_types_list(ObservationTypeSpanLike): 978 if trace_context: 979 trace_id = trace_context.get("trace_id", None) 980 parent_span_id = trace_context.get("parent_span_id", None) 981 982 if trace_id: 983 remote_parent_span = self._create_remote_parent_span( 984 trace_id=trace_id, parent_span_id=parent_span_id 985 ) 986 987 return cast( 988 Union[ 989 _AgnosticContextManager[LangfuseSpan], 990 _AgnosticContextManager[LangfuseAgent], 991 _AgnosticContextManager[LangfuseTool], 992 _AgnosticContextManager[LangfuseChain], 993 _AgnosticContextManager[LangfuseRetriever], 994 _AgnosticContextManager[LangfuseEvaluator], 995 _AgnosticContextManager[LangfuseGuardrail], 996 ], 997 self._create_span_with_parent_context( 998 as_type=as_type, 999 name=name, 1000 remote_parent_span=remote_parent_span, 1001 parent=None, 1002 end_on_exit=end_on_exit, 1003 input=input, 1004 output=output, 1005 metadata=metadata, 1006 version=version, 1007 level=level, 1008 status_message=status_message, 1009 ), 1010 ) 1011 1012 return cast( 1013 Union[ 1014 _AgnosticContextManager[LangfuseSpan], 1015 _AgnosticContextManager[LangfuseAgent], 1016 _AgnosticContextManager[LangfuseTool], 1017 _AgnosticContextManager[LangfuseChain], 1018 _AgnosticContextManager[LangfuseRetriever], 1019 _AgnosticContextManager[LangfuseEvaluator], 1020 _AgnosticContextManager[LangfuseGuardrail], 1021 ], 1022 self._start_as_current_otel_span_with_processed_media( 1023 as_type=as_type, 1024 name=name, 1025 end_on_exit=end_on_exit, 1026 input=input, 1027 output=output, 1028 metadata=metadata, 1029 version=version, 1030 level=level, 1031 status_message=status_message, 1032 ), 1033 ) 1034 1035 # This should never be reached since all valid types are handled above 1036 langfuse_logger.warning( 1037 f"Unknown observation type: {as_type}, falling back to span" 1038 ) 1039 return self._start_as_current_otel_span_with_processed_media( 1040 as_type="span", 1041 name=name, 1042 end_on_exit=end_on_exit, 1043 input=input, 1044 output=output, 1045 metadata=metadata, 1046 version=version, 1047 level=level, 1048 status_message=status_message, 1049 ) 1050 1051 def _get_span_class( 1052 self, 1053 as_type: ObservationTypeLiteral, 1054 ) -> Union[ 1055 Type[LangfuseAgent], 1056 Type[LangfuseTool], 1057 Type[LangfuseChain], 1058 Type[LangfuseRetriever], 1059 Type[LangfuseEvaluator], 1060 Type[LangfuseEmbedding], 1061 Type[LangfuseGuardrail], 1062 Type[LangfuseGeneration], 1063 Type[LangfuseEvent], 1064 Type[LangfuseSpan], 1065 ]: 1066 """Get the appropriate span class based on as_type.""" 1067 normalized_type = as_type.lower() 1068 1069 if normalized_type == "agent": 1070 return LangfuseAgent 1071 elif normalized_type == "tool": 1072 return LangfuseTool 1073 elif normalized_type == "chain": 1074 return LangfuseChain 1075 elif normalized_type == "retriever": 1076 return LangfuseRetriever 1077 elif normalized_type == "evaluator": 1078 return LangfuseEvaluator 1079 elif normalized_type == "embedding": 1080 return LangfuseEmbedding 1081 elif normalized_type == "guardrail": 1082 return LangfuseGuardrail 1083 elif normalized_type == "generation": 1084 return LangfuseGeneration 1085 elif normalized_type == "event": 1086 return LangfuseEvent 1087 elif normalized_type == "span": 1088 return LangfuseSpan 1089 else: 1090 return LangfuseSpan 1091 1092 @_agnosticcontextmanager 1093 def _create_span_with_parent_context( 1094 self, 1095 *, 1096 name: str, 1097 parent: Optional[otel_trace_api.Span] = None, 1098 remote_parent_span: Optional[otel_trace_api.Span] = None, 1099 as_type: ObservationTypeLiteralNoEvent, 1100 end_on_exit: Optional[bool] = None, 1101 input: Optional[Any] = None, 1102 output: Optional[Any] = None, 1103 metadata: Optional[Any] = None, 1104 version: Optional[str] = None, 1105 level: Optional[SpanLevel] = None, 1106 status_message: Optional[str] = None, 1107 completion_start_time: Optional[datetime] = None, 1108 model: Optional[str] = None, 1109 model_parameters: Optional[Dict[str, MapValue]] = None, 1110 usage_details: Optional[Dict[str, int]] = None, 1111 cost_details: Optional[Dict[str, float]] = None, 1112 prompt: Optional[PromptClient] = None, 1113 ) -> Any: 1114 parent_span = parent or cast(otel_trace_api.Span, remote_parent_span) 1115 1116 with otel_trace_api.use_span(parent_span): 1117 with self._start_as_current_otel_span_with_processed_media( 1118 name=name, 1119 as_type=as_type, 1120 end_on_exit=end_on_exit, 1121 input=input, 1122 output=output, 1123 metadata=metadata, 1124 version=version, 1125 level=level, 1126 status_message=status_message, 1127 completion_start_time=completion_start_time, 1128 model=model, 1129 model_parameters=model_parameters, 1130 usage_details=usage_details, 1131 cost_details=cost_details, 1132 prompt=prompt, 1133 ) as langfuse_span: 1134 if remote_parent_span is not None: 1135 langfuse_span._otel_span.set_attribute( 1136 LangfuseOtelSpanAttributes.AS_ROOT, True 1137 ) 1138 1139 yield langfuse_span 1140 1141 @_agnosticcontextmanager 1142 def _start_as_current_otel_span_with_processed_media( 1143 self, 1144 *, 1145 name: str, 1146 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 1147 end_on_exit: Optional[bool] = None, 1148 input: Optional[Any] = None, 1149 output: Optional[Any] = None, 1150 metadata: Optional[Any] = None, 1151 version: Optional[str] = None, 1152 level: Optional[SpanLevel] = None, 1153 status_message: Optional[str] = None, 1154 completion_start_time: Optional[datetime] = None, 1155 model: Optional[str] = None, 1156 model_parameters: Optional[Dict[str, MapValue]] = None, 1157 usage_details: Optional[Dict[str, int]] = None, 1158 cost_details: Optional[Dict[str, float]] = None, 1159 prompt: Optional[PromptClient] = None, 1160 ) -> Any: 1161 with self._otel_tracer.start_as_current_span( 1162 name=name, 1163 end_on_exit=end_on_exit if end_on_exit is not None else True, 1164 ) as otel_span: 1165 span_class = self._get_span_class( 1166 as_type or "generation" 1167 ) # default was "generation" 1168 common_args = { 1169 "otel_span": otel_span, 1170 "langfuse_client": self, 1171 "environment": self._environment, 1172 "input": input, 1173 "output": output, 1174 "metadata": metadata, 1175 "version": version, 1176 "level": level, 1177 "status_message": status_message, 1178 } 1179 1180 if span_class in [ 1181 LangfuseGeneration, 1182 LangfuseEmbedding, 1183 ]: 1184 common_args.update( 1185 { 1186 "completion_start_time": completion_start_time, 1187 "model": model, 1188 "model_parameters": model_parameters, 1189 "usage_details": usage_details, 1190 "cost_details": cost_details, 1191 "prompt": prompt, 1192 } 1193 ) 1194 # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed 1195 1196 yield span_class(**common_args) # type: ignore[arg-type] 1197 1198 def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]: 1199 current_span = otel_trace_api.get_current_span() 1200 1201 if current_span is otel_trace_api.INVALID_SPAN: 1202 langfuse_logger.warning( 1203 "Context error: No active span in current context. Operations that depend on an active span will be skipped. " 1204 "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context." 1205 ) 1206 return None 1207 1208 return current_span 1209 1210 def update_current_generation( 1211 self, 1212 *, 1213 name: Optional[str] = None, 1214 input: Optional[Any] = None, 1215 output: Optional[Any] = None, 1216 metadata: Optional[Any] = None, 1217 version: Optional[str] = None, 1218 level: Optional[SpanLevel] = None, 1219 status_message: Optional[str] = None, 1220 completion_start_time: Optional[datetime] = None, 1221 model: Optional[str] = None, 1222 model_parameters: Optional[Dict[str, MapValue]] = None, 1223 usage_details: Optional[Dict[str, int]] = None, 1224 cost_details: Optional[Dict[str, float]] = None, 1225 prompt: Optional[PromptClient] = None, 1226 ) -> None: 1227 """Update the current active generation span with new information. 1228 1229 This method updates the current generation span in the active context with 1230 additional information. It's useful for adding output, usage stats, or other 1231 details that become available during or after model generation. 1232 1233 Args: 1234 name: The generation name 1235 input: Updated input data for the model 1236 output: Output from the model (e.g., completions) 1237 metadata: Additional metadata to associate with the generation 1238 version: Version identifier for the model or component 1239 level: Importance level of the generation (info, warning, error) 1240 status_message: Optional status message for the generation 1241 completion_start_time: When the model started generating the response 1242 model: Name/identifier of the AI model used (e.g., "gpt-4") 1243 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1244 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1245 cost_details: Cost information for the model call 1246 prompt: Associated prompt template from Langfuse prompt management 1247 1248 Example: 1249 ```python 1250 with langfuse.start_as_current_generation(name="answer-query") as generation: 1251 # Initial setup and API call 1252 response = llm.generate(...) 1253 1254 # Update with results that weren't available at creation time 1255 langfuse.update_current_generation( 1256 output=response.text, 1257 usage_details={ 1258 "prompt_tokens": response.usage.prompt_tokens, 1259 "completion_tokens": response.usage.completion_tokens 1260 } 1261 ) 1262 ``` 1263 """ 1264 if not self._tracing_enabled: 1265 langfuse_logger.debug( 1266 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1267 ) 1268 return 1269 1270 current_otel_span = self._get_current_otel_span() 1271 1272 if current_otel_span is not None: 1273 generation = LangfuseGeneration( 1274 otel_span=current_otel_span, langfuse_client=self 1275 ) 1276 1277 if name: 1278 current_otel_span.update_name(name) 1279 1280 generation.update( 1281 input=input, 1282 output=output, 1283 metadata=metadata, 1284 version=version, 1285 level=level, 1286 status_message=status_message, 1287 completion_start_time=completion_start_time, 1288 model=model, 1289 model_parameters=model_parameters, 1290 usage_details=usage_details, 1291 cost_details=cost_details, 1292 prompt=prompt, 1293 ) 1294 1295 def update_current_span( 1296 self, 1297 *, 1298 name: Optional[str] = None, 1299 input: Optional[Any] = None, 1300 output: Optional[Any] = None, 1301 metadata: Optional[Any] = None, 1302 version: Optional[str] = None, 1303 level: Optional[SpanLevel] = None, 1304 status_message: Optional[str] = None, 1305 ) -> None: 1306 """Update the current active span with new information. 1307 1308 This method updates the current span in the active context with 1309 additional information. It's useful for adding outputs or metadata 1310 that become available during execution. 1311 1312 Args: 1313 name: The span name 1314 input: Updated input data for the operation 1315 output: Output data from the operation 1316 metadata: Additional metadata to associate with the span 1317 version: Version identifier for the code or component 1318 level: Importance level of the span (info, warning, error) 1319 status_message: Optional status message for the span 1320 1321 Example: 1322 ```python 1323 with langfuse.start_as_current_observation(name="process-data") as span: 1324 # Initial processing 1325 result = process_first_part() 1326 1327 # Update with intermediate results 1328 langfuse.update_current_span(metadata={"intermediate_result": result}) 1329 1330 # Continue processing 1331 final_result = process_second_part(result) 1332 1333 # Final update 1334 langfuse.update_current_span(output=final_result) 1335 ``` 1336 """ 1337 if not self._tracing_enabled: 1338 langfuse_logger.debug( 1339 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1340 ) 1341 return 1342 1343 current_otel_span = self._get_current_otel_span() 1344 1345 if current_otel_span is not None: 1346 span = LangfuseSpan( 1347 otel_span=current_otel_span, 1348 langfuse_client=self, 1349 environment=self._environment, 1350 ) 1351 1352 if name: 1353 current_otel_span.update_name(name) 1354 1355 span.update( 1356 input=input, 1357 output=output, 1358 metadata=metadata, 1359 version=version, 1360 level=level, 1361 status_message=status_message, 1362 ) 1363 1364 @deprecated( 1365 "Trace-level input/output is deprecated. " 1366 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1367 "This method will be removed in a future major version." 1368 ) 1369 def set_current_trace_io( 1370 self, 1371 *, 1372 input: Optional[Any] = None, 1373 output: Optional[Any] = None, 1374 ) -> None: 1375 """Set trace-level input and output for the current span's trace. 1376 1377 .. deprecated:: 1378 This is a legacy method for backward compatibility with Langfuse platform 1379 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1380 evaluators). It will be removed in a future major version. 1381 1382 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1383 use :meth:`propagate_attributes` instead. 1384 1385 Args: 1386 input: Input data to associate with the trace. 1387 output: Output data to associate with the trace. 1388 """ 1389 if not self._tracing_enabled: 1390 langfuse_logger.debug( 1391 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1392 ) 1393 return 1394 1395 current_otel_span = self._get_current_otel_span() 1396 1397 if current_otel_span is not None and current_otel_span.is_recording(): 1398 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1399 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1400 ) 1401 # We need to preserve the class to keep the correct observation type 1402 span_class = self._get_span_class(existing_observation_type) 1403 span = span_class( 1404 otel_span=current_otel_span, 1405 langfuse_client=self, 1406 environment=self._environment, 1407 ) 1408 1409 span.set_trace_io( 1410 input=input, 1411 output=output, 1412 ) 1413 1414 def set_current_trace_as_public(self) -> None: 1415 """Make the current trace publicly accessible via its URL. 1416 1417 When a trace is published, anyone with the trace link can view the full trace 1418 without needing to be logged in to Langfuse. This action cannot be undone 1419 programmatically - once published, the entire trace becomes public. 1420 1421 This is a convenience method that publishes the trace from the currently 1422 active span context. Use this when you want to make a trace public from 1423 within a traced function without needing direct access to the span object. 1424 """ 1425 if not self._tracing_enabled: 1426 langfuse_logger.debug( 1427 "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode." 1428 ) 1429 return 1430 1431 current_otel_span = self._get_current_otel_span() 1432 1433 if current_otel_span is not None and current_otel_span.is_recording(): 1434 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1435 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1436 ) 1437 # We need to preserve the class to keep the correct observation type 1438 span_class = self._get_span_class(existing_observation_type) 1439 span = span_class( 1440 otel_span=current_otel_span, 1441 langfuse_client=self, 1442 environment=self._environment, 1443 ) 1444 1445 span.set_trace_as_public() 1446 1447 def create_event( 1448 self, 1449 *, 1450 trace_context: Optional[TraceContext] = None, 1451 name: str, 1452 input: Optional[Any] = None, 1453 output: Optional[Any] = None, 1454 metadata: Optional[Any] = None, 1455 version: Optional[str] = None, 1456 level: Optional[SpanLevel] = None, 1457 status_message: Optional[str] = None, 1458 ) -> LangfuseEvent: 1459 """Create a new Langfuse observation of type 'EVENT'. 1460 1461 The created Langfuse Event observation will be the child of the current span in the context. 1462 1463 Args: 1464 trace_context: Optional context for connecting to an existing trace 1465 name: Name of the span (e.g., function or operation name) 1466 input: Input data for the operation (can be any JSON-serializable object) 1467 output: Output data from the operation (can be any JSON-serializable object) 1468 metadata: Additional metadata to associate with the span 1469 version: Version identifier for the code or component 1470 level: Importance level of the span (info, warning, error) 1471 status_message: Optional status message for the span 1472 1473 Returns: 1474 The Langfuse Event object 1475 1476 Example: 1477 ```python 1478 event = langfuse.create_event(name="process-event") 1479 ``` 1480 """ 1481 timestamp = time_ns() 1482 1483 if trace_context: 1484 trace_id = trace_context.get("trace_id", None) 1485 parent_span_id = trace_context.get("parent_span_id", None) 1486 1487 if trace_id: 1488 remote_parent_span = self._create_remote_parent_span( 1489 trace_id=trace_id, parent_span_id=parent_span_id 1490 ) 1491 1492 with otel_trace_api.use_span( 1493 cast(otel_trace_api.Span, remote_parent_span) 1494 ): 1495 otel_span = self._otel_tracer.start_span( 1496 name=name, start_time=timestamp 1497 ) 1498 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1499 1500 return cast( 1501 LangfuseEvent, 1502 LangfuseEvent( 1503 otel_span=otel_span, 1504 langfuse_client=self, 1505 environment=self._environment, 1506 input=input, 1507 output=output, 1508 metadata=metadata, 1509 version=version, 1510 level=level, 1511 status_message=status_message, 1512 ).end(end_time=timestamp), 1513 ) 1514 1515 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1516 1517 return cast( 1518 LangfuseEvent, 1519 LangfuseEvent( 1520 otel_span=otel_span, 1521 langfuse_client=self, 1522 environment=self._environment, 1523 input=input, 1524 output=output, 1525 metadata=metadata, 1526 version=version, 1527 level=level, 1528 status_message=status_message, 1529 ).end(end_time=timestamp), 1530 ) 1531 1532 def _create_remote_parent_span( 1533 self, *, trace_id: str, parent_span_id: Optional[str] 1534 ) -> Any: 1535 if not self._is_valid_trace_id(trace_id): 1536 langfuse_logger.warning( 1537 f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID." 1538 ) 1539 1540 if parent_span_id and not self._is_valid_span_id(parent_span_id): 1541 langfuse_logger.warning( 1542 f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID." 1543 ) 1544 1545 int_trace_id = int(trace_id, 16) 1546 int_parent_span_id = ( 1547 int(parent_span_id, 16) 1548 if parent_span_id 1549 else RandomIdGenerator().generate_span_id() 1550 ) 1551 1552 span_context = otel_trace_api.SpanContext( 1553 trace_id=int_trace_id, 1554 span_id=int_parent_span_id, 1555 trace_flags=otel_trace_api.TraceFlags(0x01), # mark span as sampled 1556 is_remote=False, 1557 ) 1558 1559 return otel_trace_api.NonRecordingSpan(span_context) 1560 1561 def _is_valid_trace_id(self, trace_id: str) -> bool: 1562 pattern = r"^[0-9a-f]{32}$" 1563 1564 return bool(re.match(pattern, trace_id)) 1565 1566 def _is_valid_span_id(self, span_id: str) -> bool: 1567 pattern = r"^[0-9a-f]{16}$" 1568 1569 return bool(re.match(pattern, span_id)) 1570 1571 def _create_observation_id(self, *, seed: Optional[str] = None) -> str: 1572 """Create a unique observation ID for use with Langfuse. 1573 1574 This method generates a unique observation ID (span ID in OpenTelemetry terms) 1575 for use with various Langfuse APIs. It can either generate a random ID or 1576 create a deterministic ID based on a seed string. 1577 1578 Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes. 1579 This method ensures the generated ID meets this requirement. If you need to 1580 correlate an external ID with a Langfuse observation ID, use the external ID as 1581 the seed to get a valid, deterministic observation ID. 1582 1583 Args: 1584 seed: Optional string to use as a seed for deterministic ID generation. 1585 If provided, the same seed will always produce the same ID. 1586 If not provided, a random ID will be generated. 1587 1588 Returns: 1589 A 16-character lowercase hexadecimal string representing the observation ID. 1590 1591 Example: 1592 ```python 1593 # Generate a random observation ID 1594 obs_id = langfuse.create_observation_id() 1595 1596 # Generate a deterministic ID based on a seed 1597 user_obs_id = langfuse.create_observation_id(seed="user-123-feedback") 1598 1599 # Correlate an external item ID with a Langfuse observation ID 1600 item_id = "item-789012" 1601 correlated_obs_id = langfuse.create_observation_id(seed=item_id) 1602 1603 # Use the ID with Langfuse APIs 1604 langfuse.create_score( 1605 name="relevance", 1606 value=0.95, 1607 trace_id=trace_id, 1608 observation_id=obs_id 1609 ) 1610 ``` 1611 """ 1612 if not seed: 1613 span_id_int = RandomIdGenerator().generate_span_id() 1614 1615 return self._format_otel_span_id(span_id_int) 1616 1617 return sha256(seed.encode("utf-8")).digest()[:8].hex() 1618 1619 @staticmethod 1620 def create_trace_id(*, seed: Optional[str] = None) -> str: 1621 """Create a unique trace ID for use with Langfuse. 1622 1623 This method generates a unique trace ID for use with various Langfuse APIs. 1624 It can either generate a random ID or create a deterministic ID based on 1625 a seed string. 1626 1627 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1628 This method ensures the generated ID meets this requirement. If you need to 1629 correlate an external ID with a Langfuse trace ID, use the external ID as the 1630 seed to get a valid, deterministic Langfuse trace ID. 1631 1632 Args: 1633 seed: Optional string to use as a seed for deterministic ID generation. 1634 If provided, the same seed will always produce the same ID. 1635 If not provided, a random ID will be generated. 1636 1637 Returns: 1638 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1639 1640 Example: 1641 ```python 1642 # Generate a random trace ID 1643 trace_id = langfuse.create_trace_id() 1644 1645 # Generate a deterministic ID based on a seed 1646 session_trace_id = langfuse.create_trace_id(seed="session-456") 1647 1648 # Correlate an external ID with a Langfuse trace ID 1649 external_id = "external-system-123456" 1650 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1651 1652 # Use the ID with trace context 1653 with langfuse.start_as_current_observation( 1654 name="process-request", 1655 trace_context={"trace_id": trace_id} 1656 ) as span: 1657 # Operation will be part of the specific trace 1658 pass 1659 ``` 1660 """ 1661 if not seed: 1662 trace_id_int = RandomIdGenerator().generate_trace_id() 1663 1664 return Langfuse._format_otel_trace_id(trace_id_int) 1665 1666 return sha256(seed.encode("utf-8")).digest()[:16].hex() 1667 1668 def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str: 1669 span_context = otel_span.get_span_context() 1670 1671 return self._format_otel_trace_id(span_context.trace_id) 1672 1673 def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str: 1674 span_context = otel_span.get_span_context() 1675 1676 return self._format_otel_span_id(span_context.span_id) 1677 1678 @staticmethod 1679 def _format_otel_span_id(span_id_int: int) -> str: 1680 """Format an integer span ID to a 16-character lowercase hex string. 1681 1682 Internal method to convert an OpenTelemetry integer span ID to the standard 1683 W3C Trace Context format (16-character lowercase hex string). 1684 1685 Args: 1686 span_id_int: 64-bit integer representing a span ID 1687 1688 Returns: 1689 A 16-character lowercase hexadecimal string 1690 """ 1691 return format(span_id_int, "016x") 1692 1693 @staticmethod 1694 def _format_otel_trace_id(trace_id_int: int) -> str: 1695 """Format an integer trace ID to a 32-character lowercase hex string. 1696 1697 Internal method to convert an OpenTelemetry integer trace ID to the standard 1698 W3C Trace Context format (32-character lowercase hex string). 1699 1700 Args: 1701 trace_id_int: 128-bit integer representing a trace ID 1702 1703 Returns: 1704 A 32-character lowercase hexadecimal string 1705 """ 1706 return format(trace_id_int, "032x") 1707 1708 @overload 1709 def create_score( 1710 self, 1711 *, 1712 name: str, 1713 value: float, 1714 session_id: Optional[str] = None, 1715 dataset_run_id: Optional[str] = None, 1716 trace_id: Optional[str] = None, 1717 observation_id: Optional[str] = None, 1718 score_id: Optional[str] = None, 1719 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1720 comment: Optional[str] = None, 1721 config_id: Optional[str] = None, 1722 metadata: Optional[Any] = None, 1723 timestamp: Optional[datetime] = None, 1724 ) -> None: ... 1725 1726 @overload 1727 def create_score( 1728 self, 1729 *, 1730 name: str, 1731 value: str, 1732 session_id: Optional[str] = None, 1733 dataset_run_id: Optional[str] = None, 1734 trace_id: Optional[str] = None, 1735 score_id: Optional[str] = None, 1736 observation_id: Optional[str] = None, 1737 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1738 comment: Optional[str] = None, 1739 config_id: Optional[str] = None, 1740 metadata: Optional[Any] = None, 1741 timestamp: Optional[datetime] = None, 1742 ) -> None: ... 1743 1744 def create_score( 1745 self, 1746 *, 1747 name: str, 1748 value: Union[float, str], 1749 session_id: Optional[str] = None, 1750 dataset_run_id: Optional[str] = None, 1751 trace_id: Optional[str] = None, 1752 observation_id: Optional[str] = None, 1753 score_id: Optional[str] = None, 1754 data_type: Optional[ScoreDataType] = None, 1755 comment: Optional[str] = None, 1756 config_id: Optional[str] = None, 1757 metadata: Optional[Any] = None, 1758 timestamp: Optional[datetime] = None, 1759 ) -> None: 1760 """Create a score for a specific trace or observation. 1761 1762 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 1763 used to track quality metrics, user feedback, or automated evaluations. 1764 1765 Args: 1766 name: Name of the score (e.g., "relevance", "accuracy") 1767 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1768 session_id: ID of the Langfuse session to associate the score with 1769 dataset_run_id: ID of the Langfuse dataset run to associate the score with 1770 trace_id: ID of the Langfuse trace to associate the score with 1771 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 1772 score_id: Optional custom ID for the score (auto-generated if not provided) 1773 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1774 comment: Optional comment or explanation for the score 1775 config_id: Optional ID of a score config defined in Langfuse 1776 metadata: Optional metadata to be attached to the score 1777 timestamp: Optional timestamp for the score (defaults to current UTC time) 1778 1779 Example: 1780 ```python 1781 # Create a numeric score for accuracy 1782 langfuse.create_score( 1783 name="accuracy", 1784 value=0.92, 1785 trace_id="abcdef1234567890abcdef1234567890", 1786 data_type="NUMERIC", 1787 comment="High accuracy with minor irrelevant details" 1788 ) 1789 1790 # Create a categorical score for sentiment 1791 langfuse.create_score( 1792 name="sentiment", 1793 value="positive", 1794 trace_id="abcdef1234567890abcdef1234567890", 1795 observation_id="abcdef1234567890", 1796 data_type="CATEGORICAL" 1797 ) 1798 ``` 1799 """ 1800 if not self._tracing_enabled: 1801 return 1802 1803 score_id = score_id or self._create_observation_id() 1804 1805 try: 1806 new_body = ScoreBody( 1807 id=score_id, 1808 session_id=session_id, 1809 datasetRunId=dataset_run_id, 1810 traceId=trace_id, 1811 observationId=observation_id, 1812 name=name, 1813 value=value, 1814 dataType=data_type, # type: ignore 1815 comment=comment, 1816 configId=config_id, 1817 environment=self._environment, 1818 metadata=metadata, 1819 ) 1820 1821 event = { 1822 "id": self.create_trace_id(), 1823 "type": "score-create", 1824 "timestamp": timestamp or _get_timestamp(), 1825 "body": new_body, 1826 } 1827 1828 if self._resources is not None: 1829 # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar 1830 force_sample = ( 1831 not self._is_valid_trace_id(trace_id) if trace_id else True 1832 ) 1833 1834 self._resources.add_score_task( 1835 event, 1836 force_sample=force_sample, 1837 ) 1838 1839 except Exception as e: 1840 langfuse_logger.exception( 1841 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 1842 ) 1843 1844 def _create_trace_tags_via_ingestion( 1845 self, 1846 *, 1847 trace_id: str, 1848 tags: List[str], 1849 ) -> None: 1850 """Private helper to enqueue trace tag updates via ingestion API events.""" 1851 if not self._tracing_enabled: 1852 return 1853 1854 if len(tags) == 0: 1855 return 1856 1857 try: 1858 new_body = TraceBody( 1859 id=trace_id, 1860 tags=tags, 1861 ) 1862 1863 event = { 1864 "id": self.create_trace_id(), 1865 "type": "trace-create", 1866 "timestamp": _get_timestamp(), 1867 "body": new_body, 1868 } 1869 1870 if self._resources is not None: 1871 self._resources.add_trace_task(event) 1872 except Exception as e: 1873 langfuse_logger.exception( 1874 f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}" 1875 ) 1876 1877 @overload 1878 def score_current_span( 1879 self, 1880 *, 1881 name: str, 1882 value: float, 1883 score_id: Optional[str] = None, 1884 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1885 comment: Optional[str] = None, 1886 config_id: Optional[str] = None, 1887 metadata: Optional[Any] = None, 1888 ) -> None: ... 1889 1890 @overload 1891 def score_current_span( 1892 self, 1893 *, 1894 name: str, 1895 value: str, 1896 score_id: Optional[str] = None, 1897 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1898 comment: Optional[str] = None, 1899 config_id: Optional[str] = None, 1900 metadata: Optional[Any] = None, 1901 ) -> None: ... 1902 1903 def score_current_span( 1904 self, 1905 *, 1906 name: str, 1907 value: Union[float, str], 1908 score_id: Optional[str] = None, 1909 data_type: Optional[ScoreDataType] = None, 1910 comment: Optional[str] = None, 1911 config_id: Optional[str] = None, 1912 metadata: Optional[Any] = None, 1913 ) -> None: 1914 """Create a score for the current active span. 1915 1916 This method scores the currently active span in the context. It's a convenient 1917 way to score the current operation without needing to know its trace and span IDs. 1918 1919 Args: 1920 name: Name of the score (e.g., "relevance", "accuracy") 1921 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1922 score_id: Optional custom ID for the score (auto-generated if not provided) 1923 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1924 comment: Optional comment or explanation for the score 1925 config_id: Optional ID of a score config defined in Langfuse 1926 metadata: Optional metadata to be attached to the score 1927 1928 Example: 1929 ```python 1930 with langfuse.start_as_current_generation(name="answer-query") as generation: 1931 # Generate answer 1932 response = generate_answer(...) 1933 generation.update(output=response) 1934 1935 # Score the generation 1936 langfuse.score_current_span( 1937 name="relevance", 1938 value=0.85, 1939 data_type="NUMERIC", 1940 comment="Mostly relevant but contains some tangential information", 1941 metadata={"model": "gpt-4", "prompt_version": "v2"} 1942 ) 1943 ``` 1944 """ 1945 current_span = self._get_current_otel_span() 1946 1947 if current_span is not None: 1948 trace_id = self._get_otel_trace_id(current_span) 1949 observation_id = self._get_otel_span_id(current_span) 1950 1951 langfuse_logger.info( 1952 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 1953 ) 1954 1955 self.create_score( 1956 trace_id=trace_id, 1957 observation_id=observation_id, 1958 name=name, 1959 value=cast(str, value), 1960 score_id=score_id, 1961 data_type=cast(Literal["CATEGORICAL"], data_type), 1962 comment=comment, 1963 config_id=config_id, 1964 metadata=metadata, 1965 ) 1966 1967 @overload 1968 def score_current_trace( 1969 self, 1970 *, 1971 name: str, 1972 value: float, 1973 score_id: Optional[str] = None, 1974 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1975 comment: Optional[str] = None, 1976 config_id: Optional[str] = None, 1977 metadata: Optional[Any] = None, 1978 ) -> None: ... 1979 1980 @overload 1981 def score_current_trace( 1982 self, 1983 *, 1984 name: str, 1985 value: str, 1986 score_id: Optional[str] = None, 1987 data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL", 1988 comment: Optional[str] = None, 1989 config_id: Optional[str] = None, 1990 metadata: Optional[Any] = None, 1991 ) -> None: ... 1992 1993 def score_current_trace( 1994 self, 1995 *, 1996 name: str, 1997 value: Union[float, str], 1998 score_id: Optional[str] = None, 1999 data_type: Optional[ScoreDataType] = None, 2000 comment: Optional[str] = None, 2001 config_id: Optional[str] = None, 2002 metadata: Optional[Any] = None, 2003 ) -> None: 2004 """Create a score for the current trace. 2005 2006 This method scores the trace of the currently active span. Unlike score_current_span, 2007 this method associates the score with the entire trace rather than a specific span. 2008 It's useful for scoring overall performance or quality of the entire operation. 2009 2010 Args: 2011 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2012 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2013 score_id: Optional custom ID for the score (auto-generated if not provided) 2014 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2015 comment: Optional comment or explanation for the score 2016 config_id: Optional ID of a score config defined in Langfuse 2017 metadata: Optional metadata to be attached to the score 2018 2019 Example: 2020 ```python 2021 with langfuse.start_as_current_observation(name="process-user-request") as span: 2022 # Process request 2023 result = process_complete_request() 2024 span.update(output=result) 2025 2026 # Score the overall trace 2027 langfuse.score_current_trace( 2028 name="overall_quality", 2029 value=0.95, 2030 data_type="NUMERIC", 2031 comment="High quality end-to-end response", 2032 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2033 ) 2034 ``` 2035 """ 2036 current_span = self._get_current_otel_span() 2037 2038 if current_span is not None: 2039 trace_id = self._get_otel_trace_id(current_span) 2040 2041 langfuse_logger.info( 2042 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2043 ) 2044 2045 self.create_score( 2046 trace_id=trace_id, 2047 name=name, 2048 value=cast(str, value), 2049 score_id=score_id, 2050 data_type=cast(Literal["CATEGORICAL"], data_type), 2051 comment=comment, 2052 config_id=config_id, 2053 metadata=metadata, 2054 ) 2055 2056 def flush(self) -> None: 2057 """Force flush all pending spans and events to the Langfuse API. 2058 2059 This method manually flushes any pending spans, scores, and other events to the 2060 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2061 before proceeding, without waiting for the automatic flush interval. 2062 2063 Example: 2064 ```python 2065 # Record some spans and scores 2066 with langfuse.start_as_current_observation(name="operation") as span: 2067 # Do work... 2068 pass 2069 2070 # Ensure all data is sent to Langfuse before proceeding 2071 langfuse.flush() 2072 2073 # Continue with other work 2074 ``` 2075 """ 2076 if self._resources is not None: 2077 self._resources.flush() 2078 2079 def shutdown(self) -> None: 2080 """Shut down the Langfuse client and flush all pending data. 2081 2082 This method cleanly shuts down the Langfuse client, ensuring all pending data 2083 is flushed to the API and all background threads are properly terminated. 2084 2085 It's important to call this method when your application is shutting down to 2086 prevent data loss and resource leaks. For most applications, using the client 2087 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2088 2089 Example: 2090 ```python 2091 # Initialize Langfuse 2092 langfuse = Langfuse(public_key="...", secret_key="...") 2093 2094 # Use Langfuse throughout your application 2095 # ... 2096 2097 # When application is shutting down 2098 langfuse.shutdown() 2099 ``` 2100 """ 2101 if self._resources is not None: 2102 self._resources.shutdown() 2103 2104 def get_current_trace_id(self) -> Optional[str]: 2105 """Get the trace ID of the current active span. 2106 2107 This method retrieves the trace ID from the currently active span in the context. 2108 It can be used to get the trace ID for referencing in logs, external systems, 2109 or for creating related operations. 2110 2111 Returns: 2112 The current trace ID as a 32-character lowercase hexadecimal string, 2113 or None if there is no active span. 2114 2115 Example: 2116 ```python 2117 with langfuse.start_as_current_observation(name="process-request") as span: 2118 # Get the current trace ID for reference 2119 trace_id = langfuse.get_current_trace_id() 2120 2121 # Use it for external correlation 2122 log.info(f"Processing request with trace_id: {trace_id}") 2123 2124 # Or pass to another system 2125 external_system.process(data, trace_id=trace_id) 2126 ``` 2127 """ 2128 if not self._tracing_enabled: 2129 langfuse_logger.debug( 2130 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2131 ) 2132 return None 2133 2134 current_otel_span = self._get_current_otel_span() 2135 2136 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None 2137 2138 def get_current_observation_id(self) -> Optional[str]: 2139 """Get the observation ID (span ID) of the current active span. 2140 2141 This method retrieves the observation ID from the currently active span in the context. 2142 It can be used to get the observation ID for referencing in logs, external systems, 2143 or for creating scores or other related operations. 2144 2145 Returns: 2146 The current observation ID as a 16-character lowercase hexadecimal string, 2147 or None if there is no active span. 2148 2149 Example: 2150 ```python 2151 with langfuse.start_as_current_observation(name="process-user-query") as span: 2152 # Get the current observation ID 2153 observation_id = langfuse.get_current_observation_id() 2154 2155 # Store it for later reference 2156 cache.set(f"query_{query_id}_observation", observation_id) 2157 2158 # Process the query... 2159 ``` 2160 """ 2161 if not self._tracing_enabled: 2162 langfuse_logger.debug( 2163 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2164 ) 2165 return None 2166 2167 current_otel_span = self._get_current_otel_span() 2168 2169 return self._get_otel_span_id(current_otel_span) if current_otel_span else None 2170 2171 def _get_project_id(self) -> Optional[str]: 2172 """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for api keys.""" 2173 if not self._project_id: 2174 proj = self.api.projects.get() 2175 if not proj.data or not proj.data[0].id: 2176 return None 2177 2178 self._project_id = proj.data[0].id 2179 2180 return self._project_id 2181 2182 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2183 """Get the URL to view a trace in the Langfuse UI. 2184 2185 This method generates a URL that links directly to a trace in the Langfuse UI. 2186 It's useful for providing links in logs, notifications, or debugging tools. 2187 2188 Args: 2189 trace_id: Optional trace ID to generate a URL for. If not provided, 2190 the trace ID of the current active span will be used. 2191 2192 Returns: 2193 A URL string pointing to the trace in the Langfuse UI, 2194 or None if the project ID couldn't be retrieved or no trace ID is available. 2195 2196 Example: 2197 ```python 2198 # Get URL for the current trace 2199 with langfuse.start_as_current_observation(name="process-request") as span: 2200 trace_url = langfuse.get_trace_url() 2201 log.info(f"Processing trace: {trace_url}") 2202 2203 # Get URL for a specific trace 2204 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2205 send_notification(f"Review needed for trace: {specific_trace_url}") 2206 ``` 2207 """ 2208 final_trace_id = trace_id or self.get_current_trace_id() 2209 if not final_trace_id: 2210 return None 2211 2212 project_id = self._get_project_id() 2213 2214 return ( 2215 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2216 if project_id and final_trace_id 2217 else None 2218 ) 2219 2220 def get_dataset( 2221 self, 2222 name: str, 2223 *, 2224 fetch_items_page_size: Optional[int] = 50, 2225 version: Optional[datetime] = None, 2226 ) -> "DatasetClient": 2227 """Fetch a dataset by its name. 2228 2229 Args: 2230 name (str): The name of the dataset to fetch. 2231 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2232 version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). 2233 If provided, returns the state of items at the specified UTC timestamp. 2234 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2235 2236 Returns: 2237 DatasetClient: The dataset with the given name. 2238 """ 2239 try: 2240 langfuse_logger.debug(f"Getting datasets {name}") 2241 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2242 2243 dataset_items = [] 2244 page = 1 2245 2246 while True: 2247 new_items = self.api.dataset_items.list( 2248 dataset_name=self._url_encode(name, is_url_param=True), 2249 page=page, 2250 limit=fetch_items_page_size, 2251 version=version, 2252 ) 2253 dataset_items.extend(new_items.data) 2254 2255 if new_items.meta.total_pages <= page: 2256 break 2257 2258 page += 1 2259 2260 return DatasetClient( 2261 dataset=dataset, 2262 items=dataset_items, 2263 version=version, 2264 langfuse_client=self, 2265 ) 2266 2267 except Error as e: 2268 handle_fern_exception(e) 2269 raise e 2270 2271 def get_dataset_run( 2272 self, *, dataset_name: str, run_name: str 2273 ) -> DatasetRunWithItems: 2274 """Fetch a dataset run by dataset name and run name. 2275 2276 Args: 2277 dataset_name (str): The name of the dataset. 2278 run_name (str): The name of the run. 2279 2280 Returns: 2281 DatasetRunWithItems: The dataset run with its items. 2282 """ 2283 try: 2284 return cast( 2285 DatasetRunWithItems, 2286 self.api.datasets.get_run( 2287 dataset_name=self._url_encode(dataset_name), 2288 run_name=self._url_encode(run_name), 2289 request_options=None, 2290 ), 2291 ) 2292 except Error as e: 2293 handle_fern_exception(e) 2294 raise e 2295 2296 def get_dataset_runs( 2297 self, 2298 *, 2299 dataset_name: str, 2300 page: Optional[int] = None, 2301 limit: Optional[int] = None, 2302 ) -> PaginatedDatasetRuns: 2303 """Fetch all runs for a dataset. 2304 2305 Args: 2306 dataset_name (str): The name of the dataset. 2307 page (Optional[int]): Page number, starts at 1. 2308 limit (Optional[int]): Limit of items per page. 2309 2310 Returns: 2311 PaginatedDatasetRuns: Paginated list of dataset runs. 2312 """ 2313 try: 2314 return cast( 2315 PaginatedDatasetRuns, 2316 self.api.datasets.get_runs( 2317 dataset_name=self._url_encode(dataset_name), 2318 page=page, 2319 limit=limit, 2320 request_options=None, 2321 ), 2322 ) 2323 except Error as e: 2324 handle_fern_exception(e) 2325 raise e 2326 2327 def delete_dataset_run( 2328 self, *, dataset_name: str, run_name: str 2329 ) -> DeleteDatasetRunResponse: 2330 """Delete a dataset run and all its run items. This action is irreversible. 2331 2332 Args: 2333 dataset_name (str): The name of the dataset. 2334 run_name (str): The name of the run. 2335 2336 Returns: 2337 DeleteDatasetRunResponse: Confirmation of deletion. 2338 """ 2339 try: 2340 return cast( 2341 DeleteDatasetRunResponse, 2342 self.api.datasets.delete_run( 2343 dataset_name=self._url_encode(dataset_name), 2344 run_name=self._url_encode(run_name), 2345 request_options=None, 2346 ), 2347 ) 2348 except Error as e: 2349 handle_fern_exception(e) 2350 raise e 2351 2352 def run_experiment( 2353 self, 2354 *, 2355 name: str, 2356 run_name: Optional[str] = None, 2357 description: Optional[str] = None, 2358 data: ExperimentData, 2359 task: TaskFunction, 2360 evaluators: List[EvaluatorFunction] = [], 2361 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2362 run_evaluators: List[RunEvaluatorFunction] = [], 2363 max_concurrency: int = 50, 2364 metadata: Optional[Dict[str, str]] = None, 2365 _dataset_version: Optional[datetime] = None, 2366 ) -> ExperimentResult: 2367 """Run an experiment on a dataset with automatic tracing and evaluation. 2368 2369 This method executes a task function on each item in the provided dataset, 2370 automatically traces all executions with Langfuse for observability, runs 2371 item-level and run-level evaluators on the outputs, and returns comprehensive 2372 results with evaluation metrics. 2373 2374 The experiment system provides: 2375 - Automatic tracing of all task executions 2376 - Concurrent processing with configurable limits 2377 - Comprehensive error handling that isolates failures 2378 - Integration with Langfuse datasets for experiment tracking 2379 - Flexible evaluation framework supporting both sync and async evaluators 2380 2381 Args: 2382 name: Human-readable name for the experiment. Used for identification 2383 in the Langfuse UI. 2384 run_name: Optional exact name for the experiment run. If provided, this will be 2385 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2386 If not provided, this will default to the experiment name appended with an ISO timestamp. 2387 description: Optional description explaining the experiment's purpose, 2388 methodology, or expected outcomes. 2389 data: Array of data items to process. Can be either: 2390 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2391 - List of Langfuse DatasetItem objects from dataset.items 2392 task: Function that processes each data item and returns output. 2393 Must accept 'item' as keyword argument and can return sync or async results. 2394 The task function signature should be: task(*, item, **kwargs) -> Any 2395 evaluators: List of functions to evaluate each item's output individually. 2396 Each evaluator receives input, output, expected_output, and metadata. 2397 Can return single Evaluation dict or list of Evaluation dicts. 2398 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2399 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2400 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2401 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2402 run_evaluators: List of functions to evaluate the entire experiment run. 2403 Each run evaluator receives all item_results and can compute aggregate metrics. 2404 Useful for calculating averages, distributions, or cross-item comparisons. 2405 max_concurrency: Maximum number of concurrent task executions (default: 50). 2406 Controls the number of items processed simultaneously. Adjust based on 2407 API rate limits and system resources. 2408 metadata: Optional metadata dictionary to attach to all experiment traces. 2409 This metadata will be included in every trace created during the experiment. 2410 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2411 2412 Returns: 2413 ExperimentResult containing: 2414 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2415 - item_results: List of results for each processed item with outputs and evaluations 2416 - run_evaluations: List of aggregate evaluation results for the entire run 2417 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2418 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2419 2420 Raises: 2421 ValueError: If required parameters are missing or invalid 2422 Exception: If experiment setup fails (individual item failures are handled gracefully) 2423 2424 Examples: 2425 Basic experiment with local data: 2426 ```python 2427 def summarize_text(*, item, **kwargs): 2428 return f"Summary: {item['input'][:50]}..." 2429 2430 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2431 return { 2432 "name": "output_length", 2433 "value": len(output), 2434 "comment": f"Output contains {len(output)} characters" 2435 } 2436 2437 result = langfuse.run_experiment( 2438 name="Text Summarization Test", 2439 description="Evaluate summarization quality and length", 2440 data=[ 2441 {"input": "Long article text...", "expected_output": "Expected summary"}, 2442 {"input": "Another article...", "expected_output": "Another summary"} 2443 ], 2444 task=summarize_text, 2445 evaluators=[length_evaluator] 2446 ) 2447 2448 print(f"Processed {len(result.item_results)} items") 2449 for item_result in result.item_results: 2450 print(f"Input: {item_result.item['input']}") 2451 print(f"Output: {item_result.output}") 2452 print(f"Evaluations: {item_result.evaluations}") 2453 ``` 2454 2455 Advanced experiment with async task and multiple evaluators: 2456 ```python 2457 async def llm_task(*, item, **kwargs): 2458 # Simulate async LLM call 2459 response = await openai_client.chat.completions.create( 2460 model="gpt-4", 2461 messages=[{"role": "user", "content": item["input"]}] 2462 ) 2463 return response.choices[0].message.content 2464 2465 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2466 if expected_output and expected_output.lower() in output.lower(): 2467 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2468 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2469 2470 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2471 # Simulate toxicity check 2472 toxicity_score = check_toxicity(output) # Your toxicity checker 2473 return { 2474 "name": "toxicity", 2475 "value": toxicity_score, 2476 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2477 } 2478 2479 def average_accuracy(*, item_results, **kwargs): 2480 accuracies = [ 2481 eval.value for result in item_results 2482 for eval in result.evaluations 2483 if eval.name == "accuracy" 2484 ] 2485 return { 2486 "name": "average_accuracy", 2487 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2488 "comment": f"Average accuracy across {len(accuracies)} items" 2489 } 2490 2491 result = langfuse.run_experiment( 2492 name="LLM Safety and Accuracy Test", 2493 description="Evaluate model accuracy and safety across diverse prompts", 2494 data=test_dataset, # Your dataset items 2495 task=llm_task, 2496 evaluators=[accuracy_evaluator, toxicity_evaluator], 2497 run_evaluators=[average_accuracy], 2498 max_concurrency=5, # Limit concurrent API calls 2499 metadata={"model": "gpt-4", "temperature": 0.7} 2500 ) 2501 ``` 2502 2503 Using with Langfuse datasets: 2504 ```python 2505 # Get dataset from Langfuse 2506 dataset = langfuse.get_dataset("my-eval-dataset") 2507 2508 result = dataset.run_experiment( 2509 name="Production Model Evaluation", 2510 description="Monthly evaluation of production model performance", 2511 task=my_production_task, 2512 evaluators=[accuracy_evaluator, latency_evaluator] 2513 ) 2514 2515 # Results automatically linked to dataset in Langfuse UI 2516 print(f"View results: {result['dataset_run_url']}") 2517 ``` 2518 2519 Note: 2520 - Task and evaluator functions can be either synchronous or asynchronous 2521 - Individual item failures are logged but don't stop the experiment 2522 - All executions are automatically traced and visible in Langfuse UI 2523 - When using Langfuse datasets, results are automatically linked for easy comparison 2524 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2525 - Async execution is handled automatically with smart event loop detection 2526 """ 2527 return cast( 2528 ExperimentResult, 2529 run_async_safely( 2530 self._run_experiment_async( 2531 name=name, 2532 run_name=self._create_experiment_run_name( 2533 name=name, run_name=run_name 2534 ), 2535 description=description, 2536 data=data, 2537 task=task, 2538 evaluators=evaluators or [], 2539 composite_evaluator=composite_evaluator, 2540 run_evaluators=run_evaluators or [], 2541 max_concurrency=max_concurrency, 2542 metadata=metadata, 2543 dataset_version=_dataset_version, 2544 ), 2545 ), 2546 ) 2547 2548 async def _run_experiment_async( 2549 self, 2550 *, 2551 name: str, 2552 run_name: str, 2553 description: Optional[str], 2554 data: ExperimentData, 2555 task: TaskFunction, 2556 evaluators: List[EvaluatorFunction], 2557 composite_evaluator: Optional[CompositeEvaluatorFunction], 2558 run_evaluators: List[RunEvaluatorFunction], 2559 max_concurrency: int, 2560 metadata: Optional[Dict[str, Any]] = None, 2561 dataset_version: Optional[datetime] = None, 2562 ) -> ExperimentResult: 2563 langfuse_logger.debug( 2564 f"Starting experiment '{name}' run '{run_name}' with {len(data)} items" 2565 ) 2566 2567 # Set up concurrency control 2568 semaphore = asyncio.Semaphore(max_concurrency) 2569 2570 # Process all items 2571 async def process_item(item: ExperimentItem) -> ExperimentItemResult: 2572 async with semaphore: 2573 return await self._process_experiment_item( 2574 item, 2575 task, 2576 evaluators, 2577 composite_evaluator, 2578 name, 2579 run_name, 2580 description, 2581 metadata, 2582 dataset_version, 2583 ) 2584 2585 # Run all items concurrently 2586 tasks = [process_item(item) for item in data] 2587 item_results = await asyncio.gather(*tasks, return_exceptions=True) 2588 2589 # Filter out any exceptions and log errors 2590 valid_results: List[ExperimentItemResult] = [] 2591 for i, result in enumerate(item_results): 2592 if isinstance(result, Exception): 2593 langfuse_logger.error(f"Item {i} failed: {result}") 2594 elif isinstance(result, ExperimentItemResult): 2595 valid_results.append(result) # type: ignore 2596 2597 # Run experiment-level evaluators 2598 run_evaluations: List[Evaluation] = [] 2599 for run_evaluator in run_evaluators: 2600 try: 2601 evaluations = await _run_evaluator( 2602 run_evaluator, item_results=valid_results 2603 ) 2604 run_evaluations.extend(evaluations) 2605 except Exception as e: 2606 langfuse_logger.error(f"Run evaluator failed: {e}") 2607 2608 # Generate dataset run URL if applicable 2609 dataset_run_id = valid_results[0].dataset_run_id if valid_results else None 2610 dataset_run_url = None 2611 if dataset_run_id and data: 2612 try: 2613 # Check if the first item has dataset_id (for DatasetItem objects) 2614 first_item = data[0] 2615 dataset_id = None 2616 2617 if hasattr(first_item, "dataset_id"): 2618 dataset_id = getattr(first_item, "dataset_id", None) 2619 2620 if dataset_id: 2621 project_id = self._get_project_id() 2622 2623 if project_id: 2624 dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}" 2625 2626 except Exception: 2627 pass # URL generation is optional 2628 2629 # Store run-level evaluations as scores 2630 for evaluation in run_evaluations: 2631 try: 2632 if dataset_run_id: 2633 self.create_score( 2634 dataset_run_id=dataset_run_id, 2635 name=evaluation.name or "<unknown>", 2636 value=evaluation.value, # type: ignore 2637 comment=evaluation.comment, 2638 metadata=evaluation.metadata, 2639 data_type=evaluation.data_type, # type: ignore 2640 config_id=evaluation.config_id, 2641 ) 2642 2643 except Exception as e: 2644 langfuse_logger.error(f"Failed to store run evaluation: {e}") 2645 2646 # Flush scores and traces 2647 self.flush() 2648 2649 return ExperimentResult( 2650 name=name, 2651 run_name=run_name, 2652 description=description, 2653 item_results=valid_results, 2654 run_evaluations=run_evaluations, 2655 dataset_run_id=dataset_run_id, 2656 dataset_run_url=dataset_run_url, 2657 ) 2658 2659 async def _process_experiment_item( 2660 self, 2661 item: ExperimentItem, 2662 task: Callable, 2663 evaluators: List[Callable], 2664 composite_evaluator: Optional[CompositeEvaluatorFunction], 2665 experiment_name: str, 2666 experiment_run_name: str, 2667 experiment_description: Optional[str], 2668 experiment_metadata: Optional[Dict[str, Any]] = None, 2669 dataset_version: Optional[datetime] = None, 2670 ) -> ExperimentItemResult: 2671 span_name = "experiment-item-run" 2672 2673 with self.start_as_current_observation(name=span_name) as span: 2674 try: 2675 input_data = ( 2676 item.get("input") 2677 if isinstance(item, dict) 2678 else getattr(item, "input", None) 2679 ) 2680 2681 if input_data is None: 2682 raise ValueError("Experiment Item is missing input. Skipping item.") 2683 2684 expected_output = ( 2685 item.get("expected_output") 2686 if isinstance(item, dict) 2687 else getattr(item, "expected_output", None) 2688 ) 2689 2690 item_metadata = ( 2691 item.get("metadata") 2692 if isinstance(item, dict) 2693 else getattr(item, "metadata", None) 2694 ) 2695 2696 final_observation_metadata = { 2697 "experiment_name": experiment_name, 2698 "experiment_run_name": experiment_run_name, 2699 **(experiment_metadata or {}), 2700 } 2701 2702 trace_id = span.trace_id 2703 dataset_id = None 2704 dataset_item_id = None 2705 dataset_run_id = None 2706 2707 # Link to dataset run if this is a dataset item 2708 if hasattr(item, "id") and hasattr(item, "dataset_id"): 2709 try: 2710 # Use sync API to avoid event loop issues when run_async_safely 2711 # creates multiple event loops across different threads 2712 dataset_run_item = await asyncio.to_thread( 2713 self.api.dataset_run_items.create, 2714 run_name=experiment_run_name, 2715 run_description=experiment_description, 2716 metadata=experiment_metadata, 2717 dataset_item_id=item.id, # type: ignore 2718 trace_id=trace_id, 2719 observation_id=span.id, 2720 dataset_version=dataset_version, 2721 ) 2722 2723 dataset_run_id = dataset_run_item.dataset_run_id 2724 2725 except Exception as e: 2726 langfuse_logger.error(f"Failed to create dataset run item: {e}") 2727 2728 if ( 2729 not isinstance(item, dict) 2730 and hasattr(item, "dataset_id") 2731 and hasattr(item, "id") 2732 ): 2733 dataset_id = item.dataset_id 2734 dataset_item_id = item.id 2735 2736 final_observation_metadata.update( 2737 {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id} 2738 ) 2739 2740 if isinstance(item_metadata, dict): 2741 final_observation_metadata.update(item_metadata) 2742 2743 experiment_id = dataset_run_id or self._create_observation_id() 2744 experiment_item_id = ( 2745 dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16] 2746 ) 2747 span._otel_span.set_attributes( 2748 { 2749 k: v 2750 for k, v in { 2751 LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT, 2752 LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description, 2753 LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize( 2754 expected_output 2755 ), 2756 }.items() 2757 if v is not None 2758 } 2759 ) 2760 2761 propagated_experiment_attributes = PropagatedExperimentAttributes( 2762 experiment_id=experiment_id, 2763 experiment_name=experiment_run_name, 2764 experiment_metadata=_serialize(experiment_metadata), 2765 experiment_dataset_id=dataset_id, 2766 experiment_item_id=experiment_item_id, 2767 experiment_item_metadata=_serialize(item_metadata), 2768 experiment_item_root_observation_id=span.id, 2769 ) 2770 2771 with _propagate_attributes(experiment=propagated_experiment_attributes): 2772 output = await _run_task(task, item) 2773 2774 span.update( 2775 input=input_data, 2776 output=output, 2777 metadata=final_observation_metadata, 2778 ) 2779 2780 except Exception as e: 2781 span.update( 2782 output=f"Error: {str(e)}", level="ERROR", status_message=str(e) 2783 ) 2784 raise e 2785 2786 # Run evaluators 2787 evaluations = [] 2788 2789 for evaluator in evaluators: 2790 try: 2791 eval_metadata: Optional[Dict[str, Any]] = None 2792 2793 if isinstance(item, dict): 2794 eval_metadata = item.get("metadata") 2795 elif hasattr(item, "metadata"): 2796 eval_metadata = item.metadata 2797 2798 with _propagate_attributes( 2799 experiment=propagated_experiment_attributes 2800 ): 2801 eval_results = await _run_evaluator( 2802 evaluator, 2803 input=input_data, 2804 output=output, 2805 expected_output=expected_output, 2806 metadata=eval_metadata, 2807 ) 2808 evaluations.extend(eval_results) 2809 2810 # Store evaluations as scores 2811 for evaluation in eval_results: 2812 self.create_score( 2813 trace_id=trace_id, 2814 observation_id=span.id, 2815 name=evaluation.name, 2816 value=evaluation.value, # type: ignore 2817 comment=evaluation.comment, 2818 metadata=evaluation.metadata, 2819 config_id=evaluation.config_id, 2820 data_type=evaluation.data_type, # type: ignore 2821 ) 2822 2823 except Exception as e: 2824 langfuse_logger.error(f"Evaluator failed: {e}") 2825 2826 # Run composite evaluator if provided and we have evaluations 2827 if composite_evaluator and evaluations: 2828 try: 2829 composite_eval_metadata: Optional[Dict[str, Any]] = None 2830 if isinstance(item, dict): 2831 composite_eval_metadata = item.get("metadata") 2832 elif hasattr(item, "metadata"): 2833 composite_eval_metadata = item.metadata 2834 2835 with _propagate_attributes( 2836 experiment=propagated_experiment_attributes 2837 ): 2838 result = composite_evaluator( 2839 input=input_data, 2840 output=output, 2841 expected_output=expected_output, 2842 metadata=composite_eval_metadata, 2843 evaluations=evaluations, 2844 ) 2845 2846 # Handle async composite evaluators 2847 if asyncio.iscoroutine(result): 2848 result = await result 2849 2850 # Normalize to list 2851 composite_evals: List[Evaluation] = [] 2852 if isinstance(result, (dict, Evaluation)): 2853 composite_evals = [result] # type: ignore 2854 elif isinstance(result, list): 2855 composite_evals = result # type: ignore 2856 2857 # Store composite evaluations as scores and add to evaluations list 2858 for composite_evaluation in composite_evals: 2859 self.create_score( 2860 trace_id=trace_id, 2861 observation_id=span.id, 2862 name=composite_evaluation.name, 2863 value=composite_evaluation.value, # type: ignore 2864 comment=composite_evaluation.comment, 2865 metadata=composite_evaluation.metadata, 2866 config_id=composite_evaluation.config_id, 2867 data_type=composite_evaluation.data_type, # type: ignore 2868 ) 2869 evaluations.append(composite_evaluation) 2870 2871 except Exception as e: 2872 langfuse_logger.error(f"Composite evaluator failed: {e}") 2873 2874 return ExperimentItemResult( 2875 item=item, 2876 output=output, 2877 evaluations=evaluations, 2878 trace_id=trace_id, 2879 dataset_run_id=dataset_run_id, 2880 ) 2881 2882 def _create_experiment_run_name( 2883 self, *, name: Optional[str] = None, run_name: Optional[str] = None 2884 ) -> str: 2885 if run_name: 2886 return run_name 2887 2888 iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") 2889 2890 return f"{name} - {iso_timestamp}" 2891 2892 def run_batched_evaluation( 2893 self, 2894 *, 2895 scope: Literal["traces", "observations"], 2896 mapper: MapperFunction, 2897 filter: Optional[str] = None, 2898 fetch_batch_size: int = 50, 2899 fetch_trace_fields: Optional[str] = None, 2900 max_items: Optional[int] = None, 2901 max_retries: int = 3, 2902 evaluators: List[EvaluatorFunction], 2903 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2904 max_concurrency: int = 5, 2905 metadata: Optional[Dict[str, Any]] = None, 2906 _add_observation_scores_to_trace: bool = False, 2907 _additional_trace_tags: Optional[List[str]] = None, 2908 resume_from: Optional[BatchEvaluationResumeToken] = None, 2909 verbose: bool = False, 2910 ) -> BatchEvaluationResult: 2911 """Fetch traces or observations and run evaluations on each item. 2912 2913 This method provides a powerful way to evaluate existing data in Langfuse at scale. 2914 It fetches items based on filters, transforms them using a mapper function, runs 2915 evaluators on each item, and creates scores that are linked back to the original 2916 entities. This is ideal for: 2917 2918 - Running evaluations on production traces after deployment 2919 - Backtesting new evaluation metrics on historical data 2920 - Batch scoring of observations for quality monitoring 2921 - Periodic evaluation runs on recent data 2922 2923 The method uses a streaming/pipeline approach to process items in batches, making 2924 it memory-efficient for large datasets. It includes comprehensive error handling, 2925 retry logic, and resume capability for long-running evaluations. 2926 2927 Args: 2928 scope: The type of items to evaluate. Must be one of: 2929 - "traces": Evaluate complete traces with all their observations 2930 - "observations": Evaluate individual observations (spans, generations, events) 2931 mapper: Function that transforms API response objects into evaluator inputs. 2932 Receives a trace/observation object and returns an EvaluatorInputs 2933 instance with input, output, expected_output, and metadata fields. 2934 Can be sync or async. 2935 evaluators: List of evaluation functions to run on each item. Each evaluator 2936 receives the mapped inputs and returns Evaluation object(s). Evaluator 2937 failures are logged but don't stop the batch evaluation. 2938 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 2939 - '{"tags": ["production"]}' 2940 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 2941 Default: None (fetches all items). 2942 fetch_batch_size: Number of items to fetch per API call and hold in memory. 2943 Larger values may be faster but use more memory. Default: 50. 2944 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 2945 max_items: Maximum total number of items to process. If None, processes all 2946 items matching the filter. Useful for testing or limiting evaluation runs. 2947 Default: None (process all). 2948 max_concurrency: Maximum number of items to evaluate concurrently. Controls 2949 parallelism and resource usage. Default: 5. 2950 composite_evaluator: Optional function that creates a composite score from 2951 item-level evaluations. Receives the original item and its evaluations, 2952 returns a single Evaluation. Useful for weighted averages or combined metrics. 2953 Default: None. 2954 metadata: Optional metadata dict to add to all created scores. Useful for 2955 tracking evaluation runs, versions, or other context. Default: None. 2956 max_retries: Maximum number of retry attempts for failed batch fetches. 2957 Uses exponential backoff (1s, 2s, 4s). Default: 3. 2958 verbose: If True, logs progress information to console. Useful for monitoring 2959 long-running evaluations. Default: False. 2960 resume_from: Optional resume token from a previous incomplete run. Allows 2961 continuing evaluation after interruption or failure. Default: None. 2962 2963 2964 Returns: 2965 BatchEvaluationResult containing: 2966 - total_items_fetched: Number of items fetched from API 2967 - total_items_processed: Number of items successfully evaluated 2968 - total_items_failed: Number of items that failed evaluation 2969 - total_scores_created: Scores created by item-level evaluators 2970 - total_composite_scores_created: Scores created by composite evaluator 2971 - total_evaluations_failed: Individual evaluator failures 2972 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 2973 - resume_token: Token for resuming if incomplete (None if completed) 2974 - completed: True if all items processed 2975 - duration_seconds: Total execution time 2976 - failed_item_ids: IDs of items that failed 2977 - error_summary: Error types and counts 2978 - has_more_items: True if max_items reached but more exist 2979 2980 Raises: 2981 ValueError: If invalid scope is provided. 2982 2983 Examples: 2984 Basic trace evaluation: 2985 ```python 2986 from langfuse import Langfuse, EvaluatorInputs, Evaluation 2987 2988 client = Langfuse() 2989 2990 # Define mapper to extract fields from traces 2991 def trace_mapper(trace): 2992 return EvaluatorInputs( 2993 input=trace.input, 2994 output=trace.output, 2995 expected_output=None, 2996 metadata={"trace_id": trace.id} 2997 ) 2998 2999 # Define evaluator 3000 def length_evaluator(*, input, output, expected_output, metadata): 3001 return Evaluation( 3002 name="output_length", 3003 value=len(output) if output else 0 3004 ) 3005 3006 # Run batch evaluation 3007 result = client.run_batched_evaluation( 3008 scope="traces", 3009 mapper=trace_mapper, 3010 evaluators=[length_evaluator], 3011 filter='{"tags": ["production"]}', 3012 max_items=1000, 3013 verbose=True 3014 ) 3015 3016 print(f"Processed {result.total_items_processed} traces") 3017 print(f"Created {result.total_scores_created} scores") 3018 ``` 3019 3020 Evaluation with composite scorer: 3021 ```python 3022 def accuracy_evaluator(*, input, output, expected_output, metadata): 3023 # ... evaluation logic 3024 return Evaluation(name="accuracy", value=0.85) 3025 3026 def relevance_evaluator(*, input, output, expected_output, metadata): 3027 # ... evaluation logic 3028 return Evaluation(name="relevance", value=0.92) 3029 3030 def composite_evaluator(*, item, evaluations): 3031 # Weighted average of evaluations 3032 weights = {"accuracy": 0.6, "relevance": 0.4} 3033 total = sum( 3034 e.value * weights.get(e.name, 0) 3035 for e in evaluations 3036 if isinstance(e.value, (int, float)) 3037 ) 3038 return Evaluation( 3039 name="composite_score", 3040 value=total, 3041 comment=f"Weighted average of {len(evaluations)} metrics" 3042 ) 3043 3044 result = client.run_batched_evaluation( 3045 scope="traces", 3046 mapper=trace_mapper, 3047 evaluators=[accuracy_evaluator, relevance_evaluator], 3048 composite_evaluator=composite_evaluator, 3049 filter='{"user_id": "important_user"}', 3050 verbose=True 3051 ) 3052 ``` 3053 3054 Handling incomplete runs with resume: 3055 ```python 3056 # Initial run that may fail or timeout 3057 result = client.run_batched_evaluation( 3058 scope="observations", 3059 mapper=obs_mapper, 3060 evaluators=[my_evaluator], 3061 max_items=10000, 3062 verbose=True 3063 ) 3064 3065 # Check if incomplete 3066 if not result.completed and result.resume_token: 3067 print(f"Processed {result.resume_token.items_processed} items before interruption") 3068 3069 # Resume from where it left off 3070 result = client.run_batched_evaluation( 3071 scope="observations", 3072 mapper=obs_mapper, 3073 evaluators=[my_evaluator], 3074 resume_from=result.resume_token, 3075 verbose=True 3076 ) 3077 3078 print(f"Total items processed: {result.total_items_processed}") 3079 ``` 3080 3081 Monitoring evaluator performance: 3082 ```python 3083 result = client.run_batched_evaluation(...) 3084 3085 for stats in result.evaluator_stats: 3086 success_rate = stats.successful_runs / stats.total_runs 3087 print(f"{stats.name}:") 3088 print(f" Success rate: {success_rate:.1%}") 3089 print(f" Scores created: {stats.total_scores_created}") 3090 3091 if stats.failed_runs > 0: 3092 print(f" â ī¸ Failed {stats.failed_runs} times") 3093 ``` 3094 3095 Note: 3096 - Evaluator failures are logged but don't stop the batch evaluation 3097 - Individual item failures are tracked but don't stop processing 3098 - Fetch failures are retried with exponential backoff 3099 - All scores are automatically flushed to Langfuse at the end 3100 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3101 """ 3102 runner = BatchEvaluationRunner(self) 3103 3104 return cast( 3105 BatchEvaluationResult, 3106 run_async_safely( 3107 runner.run_async( 3108 scope=scope, 3109 mapper=mapper, 3110 evaluators=evaluators, 3111 filter=filter, 3112 fetch_batch_size=fetch_batch_size, 3113 fetch_trace_fields=fetch_trace_fields, 3114 max_items=max_items, 3115 max_concurrency=max_concurrency, 3116 composite_evaluator=composite_evaluator, 3117 metadata=metadata, 3118 _add_observation_scores_to_trace=_add_observation_scores_to_trace, 3119 _additional_trace_tags=_additional_trace_tags, 3120 max_retries=max_retries, 3121 verbose=verbose, 3122 resume_from=resume_from, 3123 ) 3124 ), 3125 ) 3126 3127 def auth_check(self) -> bool: 3128 """Check if the provided credentials (public and secret key) are valid. 3129 3130 Raises: 3131 Exception: If no projects were found for the provided credentials. 3132 3133 Note: 3134 This method is blocking. It is discouraged to use it in production code. 3135 """ 3136 try: 3137 projects = self.api.projects.get() 3138 langfuse_logger.debug( 3139 f"Auth check successful, found {len(projects.data)} projects" 3140 ) 3141 if len(projects.data) == 0: 3142 raise Exception( 3143 "Auth check failed, no project found for the keys provided." 3144 ) 3145 return True 3146 3147 except AttributeError as e: 3148 langfuse_logger.warning( 3149 f"Auth check failed: Client not properly initialized. Error: {e}" 3150 ) 3151 return False 3152 3153 except Error as e: 3154 handle_fern_exception(e) 3155 raise e 3156 3157 def create_dataset( 3158 self, 3159 *, 3160 name: str, 3161 description: Optional[str] = None, 3162 metadata: Optional[Any] = None, 3163 input_schema: Optional[Any] = None, 3164 expected_output_schema: Optional[Any] = None, 3165 ) -> Dataset: 3166 """Create a dataset with the given name on Langfuse. 3167 3168 Args: 3169 name: Name of the dataset to create. 3170 description: Description of the dataset. Defaults to None. 3171 metadata: Additional metadata. Defaults to None. 3172 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3173 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3174 3175 Returns: 3176 Dataset: The created dataset as returned by the Langfuse API. 3177 """ 3178 try: 3179 langfuse_logger.debug(f"Creating datasets {name}") 3180 3181 result = self.api.datasets.create( 3182 name=name, 3183 description=description, 3184 metadata=metadata, 3185 input_schema=input_schema, 3186 expected_output_schema=expected_output_schema, 3187 ) 3188 3189 return cast(Dataset, result) 3190 3191 except Error as e: 3192 handle_fern_exception(e) 3193 raise e 3194 3195 def create_dataset_item( 3196 self, 3197 *, 3198 dataset_name: str, 3199 input: Optional[Any] = None, 3200 expected_output: Optional[Any] = None, 3201 metadata: Optional[Any] = None, 3202 source_trace_id: Optional[str] = None, 3203 source_observation_id: Optional[str] = None, 3204 status: Optional[DatasetStatus] = None, 3205 id: Optional[str] = None, 3206 ) -> DatasetItem: 3207 """Create a dataset item. 3208 3209 Upserts if an item with id already exists. 3210 3211 Args: 3212 dataset_name: Name of the dataset in which the dataset item should be created. 3213 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3214 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3215 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3216 source_trace_id: Id of the source trace. Defaults to None. 3217 source_observation_id: Id of the source observation. Defaults to None. 3218 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3219 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3220 3221 Returns: 3222 DatasetItem: The created dataset item as returned by the Langfuse API. 3223 3224 Example: 3225 ```python 3226 from langfuse import Langfuse 3227 3228 langfuse = Langfuse() 3229 3230 # Uploading items to the Langfuse dataset named "capital_cities" 3231 langfuse.create_dataset_item( 3232 dataset_name="capital_cities", 3233 input={"input": {"country": "Italy"}}, 3234 expected_output={"expected_output": "Rome"}, 3235 metadata={"foo": "bar"} 3236 ) 3237 ``` 3238 """ 3239 try: 3240 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3241 3242 result = self.api.dataset_items.create( 3243 dataset_name=dataset_name, 3244 input=input, 3245 expected_output=expected_output, 3246 metadata=metadata, 3247 source_trace_id=source_trace_id, 3248 source_observation_id=source_observation_id, 3249 status=status, 3250 id=id, 3251 ) 3252 3253 return cast(DatasetItem, result) 3254 except Error as e: 3255 handle_fern_exception(e) 3256 raise e 3257 3258 def resolve_media_references( 3259 self, 3260 *, 3261 obj: Any, 3262 resolve_with: Literal["base64_data_uri"], 3263 max_depth: int = 10, 3264 content_fetch_timeout_seconds: int = 5, 3265 ) -> Any: 3266 """Replace media reference strings in an object with base64 data URIs. 3267 3268 This method recursively traverses an object (up to max_depth) looking for media reference strings 3269 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3270 the provided Langfuse client and replaces the reference string with a base64 data URI. 3271 3272 If fetching media content fails for a reference string, a warning is logged and the reference 3273 string is left unchanged. 3274 3275 Args: 3276 obj: The object to process. Can be a primitive value, array, or nested object. 3277 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3278 resolve_with: The representation of the media content to replace the media reference string with. 3279 Currently only "base64_data_uri" is supported. 3280 max_depth: int: The maximum depth to traverse the object. Default is 10. 3281 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3282 3283 Returns: 3284 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3285 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3286 3287 Example: 3288 obj = { 3289 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3290 "nested": { 3291 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3292 } 3293 } 3294 3295 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3296 3297 # Result: 3298 # { 3299 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3300 # "nested": { 3301 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 3302 # } 3303 # } 3304 """ 3305 return LangfuseMedia.resolve_media_references( 3306 langfuse_client=self, 3307 obj=obj, 3308 resolve_with=resolve_with, 3309 max_depth=max_depth, 3310 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3311 ) 3312 3313 @overload 3314 def get_prompt( 3315 self, 3316 name: str, 3317 *, 3318 version: Optional[int] = None, 3319 label: Optional[str] = None, 3320 type: Literal["chat"], 3321 cache_ttl_seconds: Optional[int] = None, 3322 fallback: Optional[List[ChatMessageDict]] = None, 3323 max_retries: Optional[int] = None, 3324 fetch_timeout_seconds: Optional[int] = None, 3325 ) -> ChatPromptClient: ... 3326 3327 @overload 3328 def get_prompt( 3329 self, 3330 name: str, 3331 *, 3332 version: Optional[int] = None, 3333 label: Optional[str] = None, 3334 type: Literal["text"] = "text", 3335 cache_ttl_seconds: Optional[int] = None, 3336 fallback: Optional[str] = None, 3337 max_retries: Optional[int] = None, 3338 fetch_timeout_seconds: Optional[int] = None, 3339 ) -> TextPromptClient: ... 3340 3341 def get_prompt( 3342 self, 3343 name: str, 3344 *, 3345 version: Optional[int] = None, 3346 label: Optional[str] = None, 3347 type: Literal["chat", "text"] = "text", 3348 cache_ttl_seconds: Optional[int] = None, 3349 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3350 max_retries: Optional[int] = None, 3351 fetch_timeout_seconds: Optional[int] = None, 3352 ) -> PromptClient: 3353 """Get a prompt. 3354 3355 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3356 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3357 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3358 return the expired prompt as a fallback. 3359 3360 Args: 3361 name (str): The name of the prompt to retrieve. 3362 3363 Keyword Args: 3364 version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3365 label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3366 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3367 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 3368 type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". 3369 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. 3370 max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. 3371 fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default. 3372 3373 Returns: 3374 The prompt object retrieved from the cache or directly fetched if not cached or expired of type 3375 - TextPromptClient, if type argument is 'text'. 3376 - ChatPromptClient, if type argument is 'chat'. 3377 3378 Raises: 3379 Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an 3380 expired prompt in the cache, in which case it logs a warning and returns the expired prompt. 3381 """ 3382 if self._resources is None: 3383 raise Error( 3384 "SDK is not correctly initialized. Check the init logs for more details." 3385 ) 3386 if version is not None and label is not None: 3387 raise ValueError("Cannot specify both version and label at the same time.") 3388 3389 if not name: 3390 raise ValueError("Prompt name cannot be empty.") 3391 3392 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3393 bounded_max_retries = self._get_bounded_max_retries( 3394 max_retries, default_max_retries=2, max_retries_upper_bound=4 3395 ) 3396 3397 langfuse_logger.debug(f"Getting prompt '{cache_key}'") 3398 cached_prompt = self._resources.prompt_cache.get(cache_key) 3399 3400 if cached_prompt is None or cache_ttl_seconds == 0: 3401 langfuse_logger.debug( 3402 f"Prompt '{cache_key}' not found in cache or caching disabled." 3403 ) 3404 try: 3405 return self._fetch_prompt_and_update_cache( 3406 name, 3407 version=version, 3408 label=label, 3409 ttl_seconds=cache_ttl_seconds, 3410 max_retries=bounded_max_retries, 3411 fetch_timeout_seconds=fetch_timeout_seconds, 3412 ) 3413 except Exception as e: 3414 if fallback: 3415 langfuse_logger.warning( 3416 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3417 ) 3418 3419 fallback_client_args: Dict[str, Any] = { 3420 "name": name, 3421 "prompt": fallback, 3422 "type": type, 3423 "version": version or 0, 3424 "config": {}, 3425 "labels": [label] if label else [], 3426 "tags": [], 3427 } 3428 3429 if type == "text": 3430 return TextPromptClient( 3431 prompt=Prompt_Text(**fallback_client_args), 3432 is_fallback=True, 3433 ) 3434 3435 if type == "chat": 3436 return ChatPromptClient( 3437 prompt=Prompt_Chat(**fallback_client_args), 3438 is_fallback=True, 3439 ) 3440 3441 raise e 3442 3443 if cached_prompt.is_expired(): 3444 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3445 try: 3446 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3447 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3448 3449 def refresh_task() -> None: 3450 self._fetch_prompt_and_update_cache( 3451 name, 3452 version=version, 3453 label=label, 3454 ttl_seconds=cache_ttl_seconds, 3455 max_retries=bounded_max_retries, 3456 fetch_timeout_seconds=fetch_timeout_seconds, 3457 ) 3458 3459 self._resources.prompt_cache.add_refresh_prompt_task( 3460 cache_key, 3461 refresh_task, 3462 ) 3463 langfuse_logger.debug( 3464 f"Returning stale prompt '{cache_key}' from cache." 3465 ) 3466 # return stale prompt 3467 return cached_prompt.value 3468 3469 except Exception as e: 3470 langfuse_logger.warning( 3471 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3472 ) 3473 # creation of refresh prompt task failed, return stale prompt 3474 return cached_prompt.value 3475 3476 return cached_prompt.value 3477 3478 def _fetch_prompt_and_update_cache( 3479 self, 3480 name: str, 3481 *, 3482 version: Optional[int] = None, 3483 label: Optional[str] = None, 3484 ttl_seconds: Optional[int] = None, 3485 max_retries: int, 3486 fetch_timeout_seconds: Optional[int], 3487 ) -> PromptClient: 3488 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3489 langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...") 3490 3491 try: 3492 3493 @backoff.on_exception( 3494 backoff.constant, Exception, max_tries=max_retries + 1, logger=None 3495 ) 3496 def fetch_prompts() -> Any: 3497 return self.api.prompts.get( 3498 self._url_encode(name), 3499 version=version, 3500 label=label, 3501 request_options={ 3502 "timeout_in_seconds": fetch_timeout_seconds, 3503 } 3504 if fetch_timeout_seconds is not None 3505 else None, 3506 ) 3507 3508 prompt_response = fetch_prompts() 3509 3510 prompt: PromptClient 3511 if prompt_response.type == "chat": 3512 prompt = ChatPromptClient(prompt_response) 3513 else: 3514 prompt = TextPromptClient(prompt_response) 3515 3516 if self._resources is not None: 3517 self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds) 3518 3519 return prompt 3520 3521 except NotFoundError as not_found_error: 3522 langfuse_logger.warning( 3523 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3524 ) 3525 if self._resources is not None: 3526 self._resources.prompt_cache.delete(cache_key) 3527 raise not_found_error 3528 3529 except Exception as e: 3530 langfuse_logger.error( 3531 f"Error while fetching prompt '{cache_key}': {str(e)}" 3532 ) 3533 raise e 3534 3535 def _get_bounded_max_retries( 3536 self, 3537 max_retries: Optional[int], 3538 *, 3539 default_max_retries: int = 2, 3540 max_retries_upper_bound: int = 4, 3541 ) -> int: 3542 if max_retries is None: 3543 return default_max_retries 3544 3545 bounded_max_retries = min( 3546 max(max_retries, 0), 3547 max_retries_upper_bound, 3548 ) 3549 3550 return bounded_max_retries 3551 3552 @overload 3553 def create_prompt( 3554 self, 3555 *, 3556 name: str, 3557 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3558 labels: List[str] = [], 3559 tags: Optional[List[str]] = None, 3560 type: Optional[Literal["chat"]], 3561 config: Optional[Any] = None, 3562 commit_message: Optional[str] = None, 3563 ) -> ChatPromptClient: ... 3564 3565 @overload 3566 def create_prompt( 3567 self, 3568 *, 3569 name: str, 3570 prompt: str, 3571 labels: List[str] = [], 3572 tags: Optional[List[str]] = None, 3573 type: Optional[Literal["text"]] = "text", 3574 config: Optional[Any] = None, 3575 commit_message: Optional[str] = None, 3576 ) -> TextPromptClient: ... 3577 3578 def create_prompt( 3579 self, 3580 *, 3581 name: str, 3582 prompt: Union[ 3583 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3584 ], 3585 labels: List[str] = [], 3586 tags: Optional[List[str]] = None, 3587 type: Optional[Literal["chat", "text"]] = "text", 3588 config: Optional[Any] = None, 3589 commit_message: Optional[str] = None, 3590 ) -> PromptClient: 3591 """Create a new prompt in Langfuse. 3592 3593 Keyword Args: 3594 name : The name of the prompt to be created. 3595 prompt : The content of the prompt to be created. 3596 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3597 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3598 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3599 config: Additional structured data to be saved with the prompt. Defaults to None. 3600 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3601 commit_message: Optional string describing the change. 3602 3603 Returns: 3604 TextPromptClient: The prompt if type argument is 'text'. 3605 ChatPromptClient: The prompt if type argument is 'chat'. 3606 """ 3607 try: 3608 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3609 3610 if type == "chat": 3611 if not isinstance(prompt, list): 3612 raise ValueError( 3613 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 3614 ) 3615 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3616 CreateChatPromptRequest( 3617 name=name, 3618 prompt=cast(Any, prompt), 3619 labels=labels, 3620 tags=tags, 3621 config=config or {}, 3622 commit_message=commit_message, 3623 type=CreateChatPromptType.CHAT, 3624 ) 3625 ) 3626 server_prompt = self.api.prompts.create(request=request) 3627 3628 if self._resources is not None: 3629 self._resources.prompt_cache.invalidate(name) 3630 3631 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3632 3633 if not isinstance(prompt, str): 3634 raise ValueError("For 'text' type, 'prompt' must be a string.") 3635 3636 request = CreateTextPromptRequest( 3637 name=name, 3638 prompt=prompt, 3639 labels=labels, 3640 tags=tags, 3641 config=config or {}, 3642 commit_message=commit_message, 3643 ) 3644 3645 server_prompt = self.api.prompts.create(request=request) 3646 3647 if self._resources is not None: 3648 self._resources.prompt_cache.invalidate(name) 3649 3650 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3651 3652 except Error as e: 3653 handle_fern_exception(e) 3654 raise e 3655 3656 def update_prompt( 3657 self, 3658 *, 3659 name: str, 3660 version: int, 3661 new_labels: List[str] = [], 3662 ) -> Any: 3663 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3664 3665 Args: 3666 name (str): The name of the prompt to update. 3667 version (int): The version number of the prompt to update. 3668 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3669 3670 Returns: 3671 Prompt: The updated prompt from the Langfuse API. 3672 3673 """ 3674 updated_prompt = self.api.prompt_version.update( 3675 name=self._url_encode(name), 3676 version=version, 3677 new_labels=new_labels, 3678 ) 3679 3680 if self._resources is not None: 3681 self._resources.prompt_cache.invalidate(name) 3682 3683 return updated_prompt 3684 3685 def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str: 3686 # httpx âĨ 0.28 does its own WHATWG-compliant quoting (eg. encodes bare 3687 # â%â, â?â, â#â, â|â, âĻ in query/path parts). Re-quoting here would 3688 # double-encode, so we skip when the value is about to be sent straight 3689 # to httpx (`is_url_param=True`) and the installed version is âĨ 0.28. 3690 if is_url_param and Version(httpx.__version__) >= Version("0.28.0"): 3691 return url 3692 3693 # urllib.parse.quote does not escape slashes "/" by default; we need to add safe="" to force escaping 3694 # we need add safe="" to force escaping of slashes 3695 # This is necessary for prompts in prompt folders 3696 return urllib.parse.quote(url, safe="") 3697 3698 def clear_prompt_cache(self) -> None: 3699 """Clear the entire prompt cache, removing all cached prompts. 3700 3701 This method is useful when you want to force a complete refresh of all 3702 cached prompts, for example after major updates or when you need to 3703 ensure the latest versions are fetched from the server. 3704 """ 3705 if self._resources is not None: 3706 self._resources.prompt_cache.clear()
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use
should_export_spaninstead. Equivalent behavior:from langfuse.span_filter import is_default_export_span blocked = {"sqlite", "requests"} should_export_span = lambda span: ( is_default_export_span(span) and ( span.instrumentation_scope is None or span.instrumentation_scope.name not in blocked ) )should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with
gen_ai.*attributes, and known LLM instrumentation scopes).- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
- tracer_provider(Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
from langfuse.otel import Langfuse # Initialize the client (reads from env vars if not provided) langfuse = Langfuse( public_key="your-public-key", secret_key="your-secret-key", host="https://cloud.langfuse.com", # Optional, default shown ) # Create a trace span with langfuse.start_as_current_observation(name="process-query") as span: # Your application code here # Create a nested generation span for an LLM call with span.start_as_current_generation( name="generate-response", model="gpt-4", input={"query": "Tell me about AI"}, model_parameters={"temperature": 0.7, "max_tokens": 500} ) as generation: # Generate response here response = "AI is a field of computer science..." generation.update( output=response, usage_details={"prompt_tokens": 10, "completion_tokens": 50}, cost_details={"total_cost": 0.0023} ) # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL) generation.score(name="relevance", value=0.95, data_type="NUMERIC")
224 def __init__( 225 self, 226 *, 227 public_key: Optional[str] = None, 228 secret_key: Optional[str] = None, 229 base_url: Optional[str] = None, 230 host: Optional[str] = None, 231 timeout: Optional[int] = None, 232 httpx_client: Optional[httpx.Client] = None, 233 debug: bool = False, 234 tracing_enabled: Optional[bool] = True, 235 flush_at: Optional[int] = None, 236 flush_interval: Optional[float] = None, 237 environment: Optional[str] = None, 238 release: Optional[str] = None, 239 media_upload_thread_count: Optional[int] = None, 240 sample_rate: Optional[float] = None, 241 mask: Optional[MaskFunction] = None, 242 blocked_instrumentation_scopes: Optional[List[str]] = None, 243 should_export_span: Optional[Callable[[ReadableSpan], bool]] = None, 244 additional_headers: Optional[Dict[str, str]] = None, 245 tracer_provider: Optional[TracerProvider] = None, 246 ): 247 self._base_url = ( 248 base_url 249 or os.environ.get(LANGFUSE_BASE_URL) 250 or host 251 or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com") 252 ) 253 self._environment = environment or cast( 254 str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT) 255 ) 256 self._project_id: Optional[str] = None 257 sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0)) 258 if not 0.0 <= sample_rate <= 1.0: 259 raise ValueError( 260 f"Sample rate must be between 0.0 and 1.0, got {sample_rate}" 261 ) 262 263 timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5)) 264 265 self._tracing_enabled = ( 266 tracing_enabled 267 and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false" 268 ) 269 if not self._tracing_enabled: 270 langfuse_logger.info( 271 "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API." 272 ) 273 274 debug = ( 275 debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true") 276 ) 277 if debug: 278 logging.basicConfig( 279 format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 280 ) 281 langfuse_logger.setLevel(logging.DEBUG) 282 283 public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY) 284 if public_key is None: 285 langfuse_logger.warning( 286 "Authentication error: Langfuse client initialized without public_key. Client will be disabled. " 287 "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. " 288 ) 289 self._otel_tracer = otel_trace_api.NoOpTracer() 290 return 291 292 secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY) 293 if secret_key is None: 294 langfuse_logger.warning( 295 "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. " 296 "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. " 297 ) 298 self._otel_tracer = otel_trace_api.NoOpTracer() 299 return 300 301 if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true": 302 langfuse_logger.warning( 303 "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI." 304 ) 305 306 if blocked_instrumentation_scopes is not None: 307 warnings.warn( 308 "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. " 309 "Use `should_export_span` instead. Example: " 310 "from langfuse.span_filter import is_default_export_span; " 311 'blocked={"scope"}; should_export_span=lambda span: ' 312 "is_default_export_span(span) and (span.instrumentation_scope is None or " 313 "span.instrumentation_scope.name not in blocked).", 314 DeprecationWarning, 315 stacklevel=2, 316 ) 317 318 # Initialize api and tracer if requirements are met 319 self._resources = LangfuseResourceManager( 320 public_key=public_key, 321 secret_key=secret_key, 322 base_url=self._base_url, 323 timeout=timeout, 324 environment=self._environment, 325 release=release, 326 flush_at=flush_at, 327 flush_interval=flush_interval, 328 httpx_client=httpx_client, 329 media_upload_thread_count=media_upload_thread_count, 330 sample_rate=sample_rate, 331 mask=mask, 332 tracing_enabled=self._tracing_enabled, 333 blocked_instrumentation_scopes=blocked_instrumentation_scopes, 334 should_export_span=should_export_span, 335 additional_headers=additional_headers, 336 tracer_provider=tracer_provider, 337 ) 338 self._mask = self._resources.mask 339 340 self._otel_tracer = ( 341 self._resources.tracer 342 if self._tracing_enabled and self._resources.tracer is not None 343 else otel_trace_api.NoOpTracer() 344 ) 345 self.api = self._resources.api 346 self.async_api = self._resources.async_api
495 def start_observation( 496 self, 497 *, 498 trace_context: Optional[TraceContext] = None, 499 name: str, 500 as_type: ObservationTypeLiteralNoEvent = "span", 501 input: Optional[Any] = None, 502 output: Optional[Any] = None, 503 metadata: Optional[Any] = None, 504 version: Optional[str] = None, 505 level: Optional[SpanLevel] = None, 506 status_message: Optional[str] = None, 507 completion_start_time: Optional[datetime] = None, 508 model: Optional[str] = None, 509 model_parameters: Optional[Dict[str, MapValue]] = None, 510 usage_details: Optional[Dict[str, int]] = None, 511 cost_details: Optional[Dict[str, float]] = None, 512 prompt: Optional[PromptClient] = None, 513 ) -> Union[ 514 LangfuseSpan, 515 LangfuseGeneration, 516 LangfuseAgent, 517 LangfuseTool, 518 LangfuseChain, 519 LangfuseRetriever, 520 LangfuseEvaluator, 521 LangfuseEmbedding, 522 LangfuseGuardrail, 523 ]: 524 """Create a new observation of the specified type. 525 526 This method creates a new observation but does not set it as the current span in the 527 context. To create and use an observation within a context, use start_as_current_observation(). 528 529 Args: 530 trace_context: Optional context for connecting to an existing trace 531 name: Name of the observation 532 as_type: Type of observation to create (defaults to "span") 533 input: Input data for the operation 534 output: Output data from the operation 535 metadata: Additional metadata to associate with the observation 536 version: Version identifier for the code or component 537 level: Importance level of the observation 538 status_message: Optional status message for the observation 539 completion_start_time: When the model started generating (for generation types) 540 model: Name/identifier of the AI model used (for generation types) 541 model_parameters: Parameters used for the model (for generation types) 542 usage_details: Token usage information (for generation types) 543 cost_details: Cost information (for generation types) 544 prompt: Associated prompt template (for generation types) 545 546 Returns: 547 An observation object of the appropriate type that must be ended with .end() 548 """ 549 if trace_context: 550 trace_id = trace_context.get("trace_id", None) 551 parent_span_id = trace_context.get("parent_span_id", None) 552 553 if trace_id: 554 remote_parent_span = self._create_remote_parent_span( 555 trace_id=trace_id, parent_span_id=parent_span_id 556 ) 557 558 with otel_trace_api.use_span( 559 cast(otel_trace_api.Span, remote_parent_span) 560 ): 561 otel_span = self._otel_tracer.start_span(name=name) 562 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 563 564 return self._create_observation_from_otel_span( 565 otel_span=otel_span, 566 as_type=as_type, 567 input=input, 568 output=output, 569 metadata=metadata, 570 version=version, 571 level=level, 572 status_message=status_message, 573 completion_start_time=completion_start_time, 574 model=model, 575 model_parameters=model_parameters, 576 usage_details=usage_details, 577 cost_details=cost_details, 578 prompt=prompt, 579 ) 580 581 otel_span = self._otel_tracer.start_span(name=name) 582 583 return self._create_observation_from_otel_span( 584 otel_span=otel_span, 585 as_type=as_type, 586 input=input, 587 output=output, 588 metadata=metadata, 589 version=version, 590 level=level, 591 status_message=status_message, 592 completion_start_time=completion_start_time, 593 model=model, 594 model_parameters=model_parameters, 595 usage_details=usage_details, 596 cost_details=cost_details, 597 prompt=prompt, 598 )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
826 def start_as_current_observation( 827 self, 828 *, 829 trace_context: Optional[TraceContext] = None, 830 name: str, 831 as_type: ObservationTypeLiteralNoEvent = "span", 832 input: Optional[Any] = None, 833 output: Optional[Any] = None, 834 metadata: Optional[Any] = None, 835 version: Optional[str] = None, 836 level: Optional[SpanLevel] = None, 837 status_message: Optional[str] = None, 838 completion_start_time: Optional[datetime] = None, 839 model: Optional[str] = None, 840 model_parameters: Optional[Dict[str, MapValue]] = None, 841 usage_details: Optional[Dict[str, int]] = None, 842 cost_details: Optional[Dict[str, float]] = None, 843 prompt: Optional[PromptClient] = None, 844 end_on_exit: Optional[bool] = None, 845 ) -> Union[ 846 _AgnosticContextManager[LangfuseGeneration], 847 _AgnosticContextManager[LangfuseSpan], 848 _AgnosticContextManager[LangfuseAgent], 849 _AgnosticContextManager[LangfuseTool], 850 _AgnosticContextManager[LangfuseChain], 851 _AgnosticContextManager[LangfuseRetriever], 852 _AgnosticContextManager[LangfuseEvaluator], 853 _AgnosticContextManager[LangfuseEmbedding], 854 _AgnosticContextManager[LangfuseGuardrail], 855 ]: 856 """Create a new observation and set it as the current span in a context manager. 857 858 This method creates a new observation of the specified type and sets it as the 859 current span within a context manager. Use this method with a 'with' statement to 860 automatically handle the observation lifecycle within a code block. 861 862 The created observation will be the child of the current span in the context. 863 864 Args: 865 trace_context: Optional context for connecting to an existing trace 866 name: Name of the observation (e.g., function or operation name) 867 as_type: Type of observation to create (defaults to "span") 868 input: Input data for the operation (can be any JSON-serializable object) 869 output: Output data from the operation (can be any JSON-serializable object) 870 metadata: Additional metadata to associate with the observation 871 version: Version identifier for the code or component 872 level: Importance level of the observation (info, warning, error) 873 status_message: Optional status message for the observation 874 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 875 876 The following parameters are available when as_type is: "generation" or "embedding". 877 completion_start_time: When the model started generating the response 878 model: Name/identifier of the AI model used (e.g., "gpt-4") 879 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 880 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 881 cost_details: Cost information for the model call 882 prompt: Associated prompt template from Langfuse prompt management 883 884 Returns: 885 A context manager that yields the appropriate observation type based on as_type 886 887 Example: 888 ```python 889 # Create a span 890 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 891 # Do work 892 result = process_data() 893 span.update(output=result) 894 895 # Create a child span automatically 896 with span.start_as_current_observation(name="sub-operation") as child_span: 897 # Do sub-operation work 898 child_span.update(output="sub-result") 899 900 # Create a tool observation 901 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 902 # Do tool work 903 results = search_web(query) 904 tool.update(output=results) 905 906 # Create a generation observation 907 with langfuse.start_as_current_observation( 908 name="answer-generation", 909 as_type="generation", 910 model="gpt-4" 911 ) as generation: 912 # Generate answer 913 response = llm.generate(...) 914 generation.update(output=response) 915 ``` 916 """ 917 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 918 if trace_context: 919 trace_id = trace_context.get("trace_id", None) 920 parent_span_id = trace_context.get("parent_span_id", None) 921 922 if trace_id: 923 remote_parent_span = self._create_remote_parent_span( 924 trace_id=trace_id, parent_span_id=parent_span_id 925 ) 926 927 return cast( 928 Union[ 929 _AgnosticContextManager[LangfuseGeneration], 930 _AgnosticContextManager[LangfuseEmbedding], 931 ], 932 self._create_span_with_parent_context( 933 as_type=as_type, 934 name=name, 935 remote_parent_span=remote_parent_span, 936 parent=None, 937 end_on_exit=end_on_exit, 938 input=input, 939 output=output, 940 metadata=metadata, 941 version=version, 942 level=level, 943 status_message=status_message, 944 completion_start_time=completion_start_time, 945 model=model, 946 model_parameters=model_parameters, 947 usage_details=usage_details, 948 cost_details=cost_details, 949 prompt=prompt, 950 ), 951 ) 952 953 return cast( 954 Union[ 955 _AgnosticContextManager[LangfuseGeneration], 956 _AgnosticContextManager[LangfuseEmbedding], 957 ], 958 self._start_as_current_otel_span_with_processed_media( 959 as_type=as_type, 960 name=name, 961 end_on_exit=end_on_exit, 962 input=input, 963 output=output, 964 metadata=metadata, 965 version=version, 966 level=level, 967 status_message=status_message, 968 completion_start_time=completion_start_time, 969 model=model, 970 model_parameters=model_parameters, 971 usage_details=usage_details, 972 cost_details=cost_details, 973 prompt=prompt, 974 ), 975 ) 976 977 if as_type in get_observation_types_list(ObservationTypeSpanLike): 978 if trace_context: 979 trace_id = trace_context.get("trace_id", None) 980 parent_span_id = trace_context.get("parent_span_id", None) 981 982 if trace_id: 983 remote_parent_span = self._create_remote_parent_span( 984 trace_id=trace_id, parent_span_id=parent_span_id 985 ) 986 987 return cast( 988 Union[ 989 _AgnosticContextManager[LangfuseSpan], 990 _AgnosticContextManager[LangfuseAgent], 991 _AgnosticContextManager[LangfuseTool], 992 _AgnosticContextManager[LangfuseChain], 993 _AgnosticContextManager[LangfuseRetriever], 994 _AgnosticContextManager[LangfuseEvaluator], 995 _AgnosticContextManager[LangfuseGuardrail], 996 ], 997 self._create_span_with_parent_context( 998 as_type=as_type, 999 name=name, 1000 remote_parent_span=remote_parent_span, 1001 parent=None, 1002 end_on_exit=end_on_exit, 1003 input=input, 1004 output=output, 1005 metadata=metadata, 1006 version=version, 1007 level=level, 1008 status_message=status_message, 1009 ), 1010 ) 1011 1012 return cast( 1013 Union[ 1014 _AgnosticContextManager[LangfuseSpan], 1015 _AgnosticContextManager[LangfuseAgent], 1016 _AgnosticContextManager[LangfuseTool], 1017 _AgnosticContextManager[LangfuseChain], 1018 _AgnosticContextManager[LangfuseRetriever], 1019 _AgnosticContextManager[LangfuseEvaluator], 1020 _AgnosticContextManager[LangfuseGuardrail], 1021 ], 1022 self._start_as_current_otel_span_with_processed_media( 1023 as_type=as_type, 1024 name=name, 1025 end_on_exit=end_on_exit, 1026 input=input, 1027 output=output, 1028 metadata=metadata, 1029 version=version, 1030 level=level, 1031 status_message=status_message, 1032 ), 1033 ) 1034 1035 # This should never be reached since all valid types are handled above 1036 langfuse_logger.warning( 1037 f"Unknown observation type: {as_type}, falling back to span" 1038 ) 1039 return self._start_as_current_otel_span_with_processed_media( 1040 as_type="span", 1041 name=name, 1042 end_on_exit=end_on_exit, 1043 input=input, 1044 output=output, 1045 metadata=metadata, 1046 version=version, 1047 level=level, 1048 status_message=status_message, 1049 )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
- The following parameters are available when as_type is: "generation" or "embedding".
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
# Create a span with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: # Do work result = process_data() span.update(output=result) # Create a child span automatically with span.start_as_current_observation(name="sub-operation") as child_span: # Do sub-operation work child_span.update(output="sub-result") # Create a tool observation with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: # Do tool work results = search_web(query) tool.update(output=results) # Create a generation observation with langfuse.start_as_current_observation( name="answer-generation", as_type="generation", model="gpt-4" ) as generation: # Generate answer response = llm.generate(...) generation.update(output=response)
1210 def update_current_generation( 1211 self, 1212 *, 1213 name: Optional[str] = None, 1214 input: Optional[Any] = None, 1215 output: Optional[Any] = None, 1216 metadata: Optional[Any] = None, 1217 version: Optional[str] = None, 1218 level: Optional[SpanLevel] = None, 1219 status_message: Optional[str] = None, 1220 completion_start_time: Optional[datetime] = None, 1221 model: Optional[str] = None, 1222 model_parameters: Optional[Dict[str, MapValue]] = None, 1223 usage_details: Optional[Dict[str, int]] = None, 1224 cost_details: Optional[Dict[str, float]] = None, 1225 prompt: Optional[PromptClient] = None, 1226 ) -> None: 1227 """Update the current active generation span with new information. 1228 1229 This method updates the current generation span in the active context with 1230 additional information. It's useful for adding output, usage stats, or other 1231 details that become available during or after model generation. 1232 1233 Args: 1234 name: The generation name 1235 input: Updated input data for the model 1236 output: Output from the model (e.g., completions) 1237 metadata: Additional metadata to associate with the generation 1238 version: Version identifier for the model or component 1239 level: Importance level of the generation (info, warning, error) 1240 status_message: Optional status message for the generation 1241 completion_start_time: When the model started generating the response 1242 model: Name/identifier of the AI model used (e.g., "gpt-4") 1243 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1244 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1245 cost_details: Cost information for the model call 1246 prompt: Associated prompt template from Langfuse prompt management 1247 1248 Example: 1249 ```python 1250 with langfuse.start_as_current_generation(name="answer-query") as generation: 1251 # Initial setup and API call 1252 response = llm.generate(...) 1253 1254 # Update with results that weren't available at creation time 1255 langfuse.update_current_generation( 1256 output=response.text, 1257 usage_details={ 1258 "prompt_tokens": response.usage.prompt_tokens, 1259 "completion_tokens": response.usage.completion_tokens 1260 } 1261 ) 1262 ``` 1263 """ 1264 if not self._tracing_enabled: 1265 langfuse_logger.debug( 1266 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 1267 ) 1268 return 1269 1270 current_otel_span = self._get_current_otel_span() 1271 1272 if current_otel_span is not None: 1273 generation = LangfuseGeneration( 1274 otel_span=current_otel_span, langfuse_client=self 1275 ) 1276 1277 if name: 1278 current_otel_span.update_name(name) 1279 1280 generation.update( 1281 input=input, 1282 output=output, 1283 metadata=metadata, 1284 version=version, 1285 level=level, 1286 status_message=status_message, 1287 completion_start_time=completion_start_time, 1288 model=model, 1289 model_parameters=model_parameters, 1290 usage_details=usage_details, 1291 cost_details=cost_details, 1292 prompt=prompt, 1293 )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Initial setup and API call response = llm.generate(...) # Update with results that weren't available at creation time langfuse.update_current_generation( output=response.text, usage_details={ "prompt_tokens": response.usage.prompt_tokens, "completion_tokens": response.usage.completion_tokens } )
1295 def update_current_span( 1296 self, 1297 *, 1298 name: Optional[str] = None, 1299 input: Optional[Any] = None, 1300 output: Optional[Any] = None, 1301 metadata: Optional[Any] = None, 1302 version: Optional[str] = None, 1303 level: Optional[SpanLevel] = None, 1304 status_message: Optional[str] = None, 1305 ) -> None: 1306 """Update the current active span with new information. 1307 1308 This method updates the current span in the active context with 1309 additional information. It's useful for adding outputs or metadata 1310 that become available during execution. 1311 1312 Args: 1313 name: The span name 1314 input: Updated input data for the operation 1315 output: Output data from the operation 1316 metadata: Additional metadata to associate with the span 1317 version: Version identifier for the code or component 1318 level: Importance level of the span (info, warning, error) 1319 status_message: Optional status message for the span 1320 1321 Example: 1322 ```python 1323 with langfuse.start_as_current_observation(name="process-data") as span: 1324 # Initial processing 1325 result = process_first_part() 1326 1327 # Update with intermediate results 1328 langfuse.update_current_span(metadata={"intermediate_result": result}) 1329 1330 # Continue processing 1331 final_result = process_second_part(result) 1332 1333 # Final update 1334 langfuse.update_current_span(output=final_result) 1335 ``` 1336 """ 1337 if not self._tracing_enabled: 1338 langfuse_logger.debug( 1339 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1340 ) 1341 return 1342 1343 current_otel_span = self._get_current_otel_span() 1344 1345 if current_otel_span is not None: 1346 span = LangfuseSpan( 1347 otel_span=current_otel_span, 1348 langfuse_client=self, 1349 environment=self._environment, 1350 ) 1351 1352 if name: 1353 current_otel_span.update_name(name) 1354 1355 span.update( 1356 input=input, 1357 output=output, 1358 metadata=metadata, 1359 version=version, 1360 level=level, 1361 status_message=status_message, 1362 )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span: # Initial processing result = process_first_part() # Update with intermediate results langfuse.update_current_span(metadata={"intermediate_result": result}) # Continue processing final_result = process_second_part(result) # Final update langfuse.update_current_span(output=final_result)
1364 @deprecated( 1365 "Trace-level input/output is deprecated. " 1366 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1367 "This method will be removed in a future major version." 1368 ) 1369 def set_current_trace_io( 1370 self, 1371 *, 1372 input: Optional[Any] = None, 1373 output: Optional[Any] = None, 1374 ) -> None: 1375 """Set trace-level input and output for the current span's trace. 1376 1377 .. deprecated:: 1378 This is a legacy method for backward compatibility with Langfuse platform 1379 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1380 evaluators). It will be removed in a future major version. 1381 1382 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1383 use :meth:`propagate_attributes` instead. 1384 1385 Args: 1386 input: Input data to associate with the trace. 1387 output: Output data to associate with the trace. 1388 """ 1389 if not self._tracing_enabled: 1390 langfuse_logger.debug( 1391 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1392 ) 1393 return 1394 1395 current_otel_span = self._get_current_otel_span() 1396 1397 if current_otel_span is not None and current_otel_span.is_recording(): 1398 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1399 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1400 ) 1401 # We need to preserve the class to keep the correct observation type 1402 span_class = self._get_span_class(existing_observation_type) 1403 span = span_class( 1404 otel_span=current_otel_span, 1405 langfuse_client=self, 1406 environment=self._environment, 1407 ) 1408 1409 span.set_trace_io( 1410 input=input, 1411 output=output, 1412 )
Set trace-level input and output for the current span's trace.
Deprecated since version : This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version),
use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
1414 def set_current_trace_as_public(self) -> None: 1415 """Make the current trace publicly accessible via its URL. 1416 1417 When a trace is published, anyone with the trace link can view the full trace 1418 without needing to be logged in to Langfuse. This action cannot be undone 1419 programmatically - once published, the entire trace becomes public. 1420 1421 This is a convenience method that publishes the trace from the currently 1422 active span context. Use this when you want to make a trace public from 1423 within a traced function without needing direct access to the span object. 1424 """ 1425 if not self._tracing_enabled: 1426 langfuse_logger.debug( 1427 "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode." 1428 ) 1429 return 1430 1431 current_otel_span = self._get_current_otel_span() 1432 1433 if current_otel_span is not None and current_otel_span.is_recording(): 1434 existing_observation_type = current_otel_span.attributes.get( # type: ignore[attr-defined] 1435 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1436 ) 1437 # We need to preserve the class to keep the correct observation type 1438 span_class = self._get_span_class(existing_observation_type) 1439 span = span_class( 1440 otel_span=current_otel_span, 1441 langfuse_client=self, 1442 environment=self._environment, 1443 ) 1444 1445 span.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
1447 def create_event( 1448 self, 1449 *, 1450 trace_context: Optional[TraceContext] = None, 1451 name: str, 1452 input: Optional[Any] = None, 1453 output: Optional[Any] = None, 1454 metadata: Optional[Any] = None, 1455 version: Optional[str] = None, 1456 level: Optional[SpanLevel] = None, 1457 status_message: Optional[str] = None, 1458 ) -> LangfuseEvent: 1459 """Create a new Langfuse observation of type 'EVENT'. 1460 1461 The created Langfuse Event observation will be the child of the current span in the context. 1462 1463 Args: 1464 trace_context: Optional context for connecting to an existing trace 1465 name: Name of the span (e.g., function or operation name) 1466 input: Input data for the operation (can be any JSON-serializable object) 1467 output: Output data from the operation (can be any JSON-serializable object) 1468 metadata: Additional metadata to associate with the span 1469 version: Version identifier for the code or component 1470 level: Importance level of the span (info, warning, error) 1471 status_message: Optional status message for the span 1472 1473 Returns: 1474 The Langfuse Event object 1475 1476 Example: 1477 ```python 1478 event = langfuse.create_event(name="process-event") 1479 ``` 1480 """ 1481 timestamp = time_ns() 1482 1483 if trace_context: 1484 trace_id = trace_context.get("trace_id", None) 1485 parent_span_id = trace_context.get("parent_span_id", None) 1486 1487 if trace_id: 1488 remote_parent_span = self._create_remote_parent_span( 1489 trace_id=trace_id, parent_span_id=parent_span_id 1490 ) 1491 1492 with otel_trace_api.use_span( 1493 cast(otel_trace_api.Span, remote_parent_span) 1494 ): 1495 otel_span = self._otel_tracer.start_span( 1496 name=name, start_time=timestamp 1497 ) 1498 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1499 1500 return cast( 1501 LangfuseEvent, 1502 LangfuseEvent( 1503 otel_span=otel_span, 1504 langfuse_client=self, 1505 environment=self._environment, 1506 input=input, 1507 output=output, 1508 metadata=metadata, 1509 version=version, 1510 level=level, 1511 status_message=status_message, 1512 ).end(end_time=timestamp), 1513 ) 1514 1515 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1516 1517 return cast( 1518 LangfuseEvent, 1519 LangfuseEvent( 1520 otel_span=otel_span, 1521 langfuse_client=self, 1522 environment=self._environment, 1523 input=input, 1524 output=output, 1525 metadata=metadata, 1526 version=version, 1527 level=level, 1528 status_message=status_message, 1529 ).end(end_time=timestamp), 1530 )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
1619 @staticmethod 1620 def create_trace_id(*, seed: Optional[str] = None) -> str: 1621 """Create a unique trace ID for use with Langfuse. 1622 1623 This method generates a unique trace ID for use with various Langfuse APIs. 1624 It can either generate a random ID or create a deterministic ID based on 1625 a seed string. 1626 1627 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1628 This method ensures the generated ID meets this requirement. If you need to 1629 correlate an external ID with a Langfuse trace ID, use the external ID as the 1630 seed to get a valid, deterministic Langfuse trace ID. 1631 1632 Args: 1633 seed: Optional string to use as a seed for deterministic ID generation. 1634 If provided, the same seed will always produce the same ID. 1635 If not provided, a random ID will be generated. 1636 1637 Returns: 1638 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1639 1640 Example: 1641 ```python 1642 # Generate a random trace ID 1643 trace_id = langfuse.create_trace_id() 1644 1645 # Generate a deterministic ID based on a seed 1646 session_trace_id = langfuse.create_trace_id(seed="session-456") 1647 1648 # Correlate an external ID with a Langfuse trace ID 1649 external_id = "external-system-123456" 1650 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1651 1652 # Use the ID with trace context 1653 with langfuse.start_as_current_observation( 1654 name="process-request", 1655 trace_context={"trace_id": trace_id} 1656 ) as span: 1657 # Operation will be part of the specific trace 1658 pass 1659 ``` 1660 """ 1661 if not seed: 1662 trace_id_int = RandomIdGenerator().generate_trace_id() 1663 1664 return Langfuse._format_otel_trace_id(trace_id_int) 1665 1666 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
# Generate a random trace ID trace_id = langfuse.create_trace_id() # Generate a deterministic ID based on a seed session_trace_id = langfuse.create_trace_id(seed="session-456") # Correlate an external ID with a Langfuse trace ID external_id = "external-system-123456" correlated_trace_id = langfuse.create_trace_id(seed=external_id) # Use the ID with trace context with langfuse.start_as_current_observation( name="process-request", trace_context={"trace_id": trace_id} ) as span: # Operation will be part of the specific trace pass
1744 def create_score( 1745 self, 1746 *, 1747 name: str, 1748 value: Union[float, str], 1749 session_id: Optional[str] = None, 1750 dataset_run_id: Optional[str] = None, 1751 trace_id: Optional[str] = None, 1752 observation_id: Optional[str] = None, 1753 score_id: Optional[str] = None, 1754 data_type: Optional[ScoreDataType] = None, 1755 comment: Optional[str] = None, 1756 config_id: Optional[str] = None, 1757 metadata: Optional[Any] = None, 1758 timestamp: Optional[datetime] = None, 1759 ) -> None: 1760 """Create a score for a specific trace or observation. 1761 1762 This method creates a score for evaluating a Langfuse trace or observation. Scores can be 1763 used to track quality metrics, user feedback, or automated evaluations. 1764 1765 Args: 1766 name: Name of the score (e.g., "relevance", "accuracy") 1767 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1768 session_id: ID of the Langfuse session to associate the score with 1769 dataset_run_id: ID of the Langfuse dataset run to associate the score with 1770 trace_id: ID of the Langfuse trace to associate the score with 1771 observation_id: Optional ID of the specific observation to score. Trace ID must be provided too. 1772 score_id: Optional custom ID for the score (auto-generated if not provided) 1773 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1774 comment: Optional comment or explanation for the score 1775 config_id: Optional ID of a score config defined in Langfuse 1776 metadata: Optional metadata to be attached to the score 1777 timestamp: Optional timestamp for the score (defaults to current UTC time) 1778 1779 Example: 1780 ```python 1781 # Create a numeric score for accuracy 1782 langfuse.create_score( 1783 name="accuracy", 1784 value=0.92, 1785 trace_id="abcdef1234567890abcdef1234567890", 1786 data_type="NUMERIC", 1787 comment="High accuracy with minor irrelevant details" 1788 ) 1789 1790 # Create a categorical score for sentiment 1791 langfuse.create_score( 1792 name="sentiment", 1793 value="positive", 1794 trace_id="abcdef1234567890abcdef1234567890", 1795 observation_id="abcdef1234567890", 1796 data_type="CATEGORICAL" 1797 ) 1798 ``` 1799 """ 1800 if not self._tracing_enabled: 1801 return 1802 1803 score_id = score_id or self._create_observation_id() 1804 1805 try: 1806 new_body = ScoreBody( 1807 id=score_id, 1808 session_id=session_id, 1809 datasetRunId=dataset_run_id, 1810 traceId=trace_id, 1811 observationId=observation_id, 1812 name=name, 1813 value=value, 1814 dataType=data_type, # type: ignore 1815 comment=comment, 1816 configId=config_id, 1817 environment=self._environment, 1818 metadata=metadata, 1819 ) 1820 1821 event = { 1822 "id": self.create_trace_id(), 1823 "type": "score-create", 1824 "timestamp": timestamp or _get_timestamp(), 1825 "body": new_body, 1826 } 1827 1828 if self._resources is not None: 1829 # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar 1830 force_sample = ( 1831 not self._is_valid_trace_id(trace_id) if trace_id else True 1832 ) 1833 1834 self._resources.add_score_task( 1835 event, 1836 force_sample=force_sample, 1837 ) 1838 1839 except Exception as e: 1840 langfuse_logger.exception( 1841 f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}" 1842 )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy langfuse.create_score( name="accuracy", value=0.92, trace_id="abcdef1234567890abcdef1234567890", data_type="NUMERIC", comment="High accuracy with minor irrelevant details" ) # Create a categorical score for sentiment langfuse.create_score( name="sentiment", value="positive", trace_id="abcdef1234567890abcdef1234567890", observation_id="abcdef1234567890", data_type="CATEGORICAL" )
1903 def score_current_span( 1904 self, 1905 *, 1906 name: str, 1907 value: Union[float, str], 1908 score_id: Optional[str] = None, 1909 data_type: Optional[ScoreDataType] = None, 1910 comment: Optional[str] = None, 1911 config_id: Optional[str] = None, 1912 metadata: Optional[Any] = None, 1913 ) -> None: 1914 """Create a score for the current active span. 1915 1916 This method scores the currently active span in the context. It's a convenient 1917 way to score the current operation without needing to know its trace and span IDs. 1918 1919 Args: 1920 name: Name of the score (e.g., "relevance", "accuracy") 1921 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 1922 score_id: Optional custom ID for the score (auto-generated if not provided) 1923 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 1924 comment: Optional comment or explanation for the score 1925 config_id: Optional ID of a score config defined in Langfuse 1926 metadata: Optional metadata to be attached to the score 1927 1928 Example: 1929 ```python 1930 with langfuse.start_as_current_generation(name="answer-query") as generation: 1931 # Generate answer 1932 response = generate_answer(...) 1933 generation.update(output=response) 1934 1935 # Score the generation 1936 langfuse.score_current_span( 1937 name="relevance", 1938 value=0.85, 1939 data_type="NUMERIC", 1940 comment="Mostly relevant but contains some tangential information", 1941 metadata={"model": "gpt-4", "prompt_version": "v2"} 1942 ) 1943 ``` 1944 """ 1945 current_span = self._get_current_otel_span() 1946 1947 if current_span is not None: 1948 trace_id = self._get_otel_trace_id(current_span) 1949 observation_id = self._get_otel_span_id(current_span) 1950 1951 langfuse_logger.info( 1952 f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}" 1953 ) 1954 1955 self.create_score( 1956 trace_id=trace_id, 1957 observation_id=observation_id, 1958 name=name, 1959 value=cast(str, value), 1960 score_id=score_id, 1961 data_type=cast(Literal["CATEGORICAL"], data_type), 1962 comment=comment, 1963 config_id=config_id, 1964 metadata=metadata, 1965 )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
1993 def score_current_trace( 1994 self, 1995 *, 1996 name: str, 1997 value: Union[float, str], 1998 score_id: Optional[str] = None, 1999 data_type: Optional[ScoreDataType] = None, 2000 comment: Optional[str] = None, 2001 config_id: Optional[str] = None, 2002 metadata: Optional[Any] = None, 2003 ) -> None: 2004 """Create a score for the current trace. 2005 2006 This method scores the trace of the currently active span. Unlike score_current_span, 2007 this method associates the score with the entire trace rather than a specific span. 2008 It's useful for scoring overall performance or quality of the entire operation. 2009 2010 Args: 2011 name: Name of the score (e.g., "user_satisfaction", "overall_quality") 2012 value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL) 2013 score_id: Optional custom ID for the score (auto-generated if not provided) 2014 data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL) 2015 comment: Optional comment or explanation for the score 2016 config_id: Optional ID of a score config defined in Langfuse 2017 metadata: Optional metadata to be attached to the score 2018 2019 Example: 2020 ```python 2021 with langfuse.start_as_current_observation(name="process-user-request") as span: 2022 # Process request 2023 result = process_complete_request() 2024 span.update(output=result) 2025 2026 # Score the overall trace 2027 langfuse.score_current_trace( 2028 name="overall_quality", 2029 value=0.95, 2030 data_type="NUMERIC", 2031 comment="High quality end-to-end response", 2032 metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} 2033 ) 2034 ``` 2035 """ 2036 current_span = self._get_current_otel_span() 2037 2038 if current_span is not None: 2039 trace_id = self._get_otel_trace_id(current_span) 2040 2041 langfuse_logger.info( 2042 f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}" 2043 ) 2044 2045 self.create_score( 2046 trace_id=trace_id, 2047 name=name, 2048 value=cast(str, value), 2049 score_id=score_id, 2050 data_type=cast(Literal["CATEGORICAL"], data_type), 2051 comment=comment, 2052 config_id=config_id, 2053 metadata=metadata, 2054 )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
2056 def flush(self) -> None: 2057 """Force flush all pending spans and events to the Langfuse API. 2058 2059 This method manually flushes any pending spans, scores, and other events to the 2060 Langfuse API. It's useful in scenarios where you want to ensure all data is sent 2061 before proceeding, without waiting for the automatic flush interval. 2062 2063 Example: 2064 ```python 2065 # Record some spans and scores 2066 with langfuse.start_as_current_observation(name="operation") as span: 2067 # Do work... 2068 pass 2069 2070 # Ensure all data is sent to Langfuse before proceeding 2071 langfuse.flush() 2072 2073 # Continue with other work 2074 ``` 2075 """ 2076 if self._resources is not None: 2077 self._resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
2079 def shutdown(self) -> None: 2080 """Shut down the Langfuse client and flush all pending data. 2081 2082 This method cleanly shuts down the Langfuse client, ensuring all pending data 2083 is flushed to the API and all background threads are properly terminated. 2084 2085 It's important to call this method when your application is shutting down to 2086 prevent data loss and resource leaks. For most applications, using the client 2087 as a context manager or relying on the automatic shutdown via atexit is sufficient. 2088 2089 Example: 2090 ```python 2091 # Initialize Langfuse 2092 langfuse = Langfuse(public_key="...", secret_key="...") 2093 2094 # Use Langfuse throughout your application 2095 # ... 2096 2097 # When application is shutting down 2098 langfuse.shutdown() 2099 ``` 2100 """ 2101 if self._resources is not None: 2102 self._resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
2104 def get_current_trace_id(self) -> Optional[str]: 2105 """Get the trace ID of the current active span. 2106 2107 This method retrieves the trace ID from the currently active span in the context. 2108 It can be used to get the trace ID for referencing in logs, external systems, 2109 or for creating related operations. 2110 2111 Returns: 2112 The current trace ID as a 32-character lowercase hexadecimal string, 2113 or None if there is no active span. 2114 2115 Example: 2116 ```python 2117 with langfuse.start_as_current_observation(name="process-request") as span: 2118 # Get the current trace ID for reference 2119 trace_id = langfuse.get_current_trace_id() 2120 2121 # Use it for external correlation 2122 log.info(f"Processing request with trace_id: {trace_id}") 2123 2124 # Or pass to another system 2125 external_system.process(data, trace_id=trace_id) 2126 ``` 2127 """ 2128 if not self._tracing_enabled: 2129 langfuse_logger.debug( 2130 "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode." 2131 ) 2132 return None 2133 2134 current_otel_span = self._get_current_otel_span() 2135 2136 return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
2138 def get_current_observation_id(self) -> Optional[str]: 2139 """Get the observation ID (span ID) of the current active span. 2140 2141 This method retrieves the observation ID from the currently active span in the context. 2142 It can be used to get the observation ID for referencing in logs, external systems, 2143 or for creating scores or other related operations. 2144 2145 Returns: 2146 The current observation ID as a 16-character lowercase hexadecimal string, 2147 or None if there is no active span. 2148 2149 Example: 2150 ```python 2151 with langfuse.start_as_current_observation(name="process-user-query") as span: 2152 # Get the current observation ID 2153 observation_id = langfuse.get_current_observation_id() 2154 2155 # Store it for later reference 2156 cache.set(f"query_{query_id}_observation", observation_id) 2157 2158 # Process the query... 2159 ``` 2160 """ 2161 if not self._tracing_enabled: 2162 langfuse_logger.debug( 2163 "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode." 2164 ) 2165 return None 2166 2167 current_otel_span = self._get_current_otel_span() 2168 2169 return self._get_otel_span_id(current_otel_span) if current_otel_span else None
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
2182 def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]: 2183 """Get the URL to view a trace in the Langfuse UI. 2184 2185 This method generates a URL that links directly to a trace in the Langfuse UI. 2186 It's useful for providing links in logs, notifications, or debugging tools. 2187 2188 Args: 2189 trace_id: Optional trace ID to generate a URL for. If not provided, 2190 the trace ID of the current active span will be used. 2191 2192 Returns: 2193 A URL string pointing to the trace in the Langfuse UI, 2194 or None if the project ID couldn't be retrieved or no trace ID is available. 2195 2196 Example: 2197 ```python 2198 # Get URL for the current trace 2199 with langfuse.start_as_current_observation(name="process-request") as span: 2200 trace_url = langfuse.get_trace_url() 2201 log.info(f"Processing trace: {trace_url}") 2202 2203 # Get URL for a specific trace 2204 specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") 2205 send_notification(f"Review needed for trace: {specific_trace_url}") 2206 ``` 2207 """ 2208 final_trace_id = trace_id or self.get_current_trace_id() 2209 if not final_trace_id: 2210 return None 2211 2212 project_id = self._get_project_id() 2213 2214 return ( 2215 f"{self._base_url}/project/{project_id}/traces/{final_trace_id}" 2216 if project_id and final_trace_id 2217 else None 2218 )
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
# Get URL for the current trace with langfuse.start_as_current_observation(name="process-request") as span: trace_url = langfuse.get_trace_url() log.info(f"Processing trace: {trace_url}") # Get URL for a specific trace specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef") send_notification(f"Review needed for trace: {specific_trace_url}")
2220 def get_dataset( 2221 self, 2222 name: str, 2223 *, 2224 fetch_items_page_size: Optional[int] = 50, 2225 version: Optional[datetime] = None, 2226 ) -> "DatasetClient": 2227 """Fetch a dataset by its name. 2228 2229 Args: 2230 name (str): The name of the dataset to fetch. 2231 fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50. 2232 version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). 2233 If provided, returns the state of items at the specified UTC timestamp. 2234 If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC. 2235 2236 Returns: 2237 DatasetClient: The dataset with the given name. 2238 """ 2239 try: 2240 langfuse_logger.debug(f"Getting datasets {name}") 2241 dataset = self.api.datasets.get(dataset_name=self._url_encode(name)) 2242 2243 dataset_items = [] 2244 page = 1 2245 2246 while True: 2247 new_items = self.api.dataset_items.list( 2248 dataset_name=self._url_encode(name, is_url_param=True), 2249 page=page, 2250 limit=fetch_items_page_size, 2251 version=version, 2252 ) 2253 dataset_items.extend(new_items.data) 2254 2255 if new_items.meta.total_pages <= page: 2256 break 2257 2258 page += 1 2259 2260 return DatasetClient( 2261 dataset=dataset, 2262 items=dataset_items, 2263 version=version, 2264 langfuse_client=self, 2265 ) 2266 2267 except Error as e: 2268 handle_fern_exception(e) 2269 raise e
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
2271 def get_dataset_run( 2272 self, *, dataset_name: str, run_name: str 2273 ) -> DatasetRunWithItems: 2274 """Fetch a dataset run by dataset name and run name. 2275 2276 Args: 2277 dataset_name (str): The name of the dataset. 2278 run_name (str): The name of the run. 2279 2280 Returns: 2281 DatasetRunWithItems: The dataset run with its items. 2282 """ 2283 try: 2284 return cast( 2285 DatasetRunWithItems, 2286 self.api.datasets.get_run( 2287 dataset_name=self._url_encode(dataset_name), 2288 run_name=self._url_encode(run_name), 2289 request_options=None, 2290 ), 2291 ) 2292 except Error as e: 2293 handle_fern_exception(e) 2294 raise e
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
2296 def get_dataset_runs( 2297 self, 2298 *, 2299 dataset_name: str, 2300 page: Optional[int] = None, 2301 limit: Optional[int] = None, 2302 ) -> PaginatedDatasetRuns: 2303 """Fetch all runs for a dataset. 2304 2305 Args: 2306 dataset_name (str): The name of the dataset. 2307 page (Optional[int]): Page number, starts at 1. 2308 limit (Optional[int]): Limit of items per page. 2309 2310 Returns: 2311 PaginatedDatasetRuns: Paginated list of dataset runs. 2312 """ 2313 try: 2314 return cast( 2315 PaginatedDatasetRuns, 2316 self.api.datasets.get_runs( 2317 dataset_name=self._url_encode(dataset_name), 2318 page=page, 2319 limit=limit, 2320 request_options=None, 2321 ), 2322 ) 2323 except Error as e: 2324 handle_fern_exception(e) 2325 raise e
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
2327 def delete_dataset_run( 2328 self, *, dataset_name: str, run_name: str 2329 ) -> DeleteDatasetRunResponse: 2330 """Delete a dataset run and all its run items. This action is irreversible. 2331 2332 Args: 2333 dataset_name (str): The name of the dataset. 2334 run_name (str): The name of the run. 2335 2336 Returns: 2337 DeleteDatasetRunResponse: Confirmation of deletion. 2338 """ 2339 try: 2340 return cast( 2341 DeleteDatasetRunResponse, 2342 self.api.datasets.delete_run( 2343 dataset_name=self._url_encode(dataset_name), 2344 run_name=self._url_encode(run_name), 2345 request_options=None, 2346 ), 2347 ) 2348 except Error as e: 2349 handle_fern_exception(e) 2350 raise e
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
2352 def run_experiment( 2353 self, 2354 *, 2355 name: str, 2356 run_name: Optional[str] = None, 2357 description: Optional[str] = None, 2358 data: ExperimentData, 2359 task: TaskFunction, 2360 evaluators: List[EvaluatorFunction] = [], 2361 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2362 run_evaluators: List[RunEvaluatorFunction] = [], 2363 max_concurrency: int = 50, 2364 metadata: Optional[Dict[str, str]] = None, 2365 _dataset_version: Optional[datetime] = None, 2366 ) -> ExperimentResult: 2367 """Run an experiment on a dataset with automatic tracing and evaluation. 2368 2369 This method executes a task function on each item in the provided dataset, 2370 automatically traces all executions with Langfuse for observability, runs 2371 item-level and run-level evaluators on the outputs, and returns comprehensive 2372 results with evaluation metrics. 2373 2374 The experiment system provides: 2375 - Automatic tracing of all task executions 2376 - Concurrent processing with configurable limits 2377 - Comprehensive error handling that isolates failures 2378 - Integration with Langfuse datasets for experiment tracking 2379 - Flexible evaluation framework supporting both sync and async evaluators 2380 2381 Args: 2382 name: Human-readable name for the experiment. Used for identification 2383 in the Langfuse UI. 2384 run_name: Optional exact name for the experiment run. If provided, this will be 2385 used as the exact dataset run name if the `data` contains Langfuse dataset items. 2386 If not provided, this will default to the experiment name appended with an ISO timestamp. 2387 description: Optional description explaining the experiment's purpose, 2388 methodology, or expected outcomes. 2389 data: Array of data items to process. Can be either: 2390 - List of dict-like items with 'input', 'expected_output', 'metadata' keys 2391 - List of Langfuse DatasetItem objects from dataset.items 2392 task: Function that processes each data item and returns output. 2393 Must accept 'item' as keyword argument and can return sync or async results. 2394 The task function signature should be: task(*, item, **kwargs) -> Any 2395 evaluators: List of functions to evaluate each item's output individually. 2396 Each evaluator receives input, output, expected_output, and metadata. 2397 Can return single Evaluation dict or list of Evaluation dicts. 2398 composite_evaluator: Optional function that creates composite scores from item-level evaluations. 2399 Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) 2400 plus the list of evaluations from item-level evaluators. Useful for weighted averages, 2401 pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics. 2402 run_evaluators: List of functions to evaluate the entire experiment run. 2403 Each run evaluator receives all item_results and can compute aggregate metrics. 2404 Useful for calculating averages, distributions, or cross-item comparisons. 2405 max_concurrency: Maximum number of concurrent task executions (default: 50). 2406 Controls the number of items processed simultaneously. Adjust based on 2407 API rate limits and system resources. 2408 metadata: Optional metadata dictionary to attach to all experiment traces. 2409 This metadata will be included in every trace created during the experiment. 2410 If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too. 2411 2412 Returns: 2413 ExperimentResult containing: 2414 - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset. 2415 - item_results: List of results for each processed item with outputs and evaluations 2416 - run_evaluations: List of aggregate evaluation results for the entire run 2417 - dataset_run_id: ID of the dataset run (if using Langfuse datasets) 2418 - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable) 2419 2420 Raises: 2421 ValueError: If required parameters are missing or invalid 2422 Exception: If experiment setup fails (individual item failures are handled gracefully) 2423 2424 Examples: 2425 Basic experiment with local data: 2426 ```python 2427 def summarize_text(*, item, **kwargs): 2428 return f"Summary: {item['input'][:50]}..." 2429 2430 def length_evaluator(*, input, output, expected_output=None, **kwargs): 2431 return { 2432 "name": "output_length", 2433 "value": len(output), 2434 "comment": f"Output contains {len(output)} characters" 2435 } 2436 2437 result = langfuse.run_experiment( 2438 name="Text Summarization Test", 2439 description="Evaluate summarization quality and length", 2440 data=[ 2441 {"input": "Long article text...", "expected_output": "Expected summary"}, 2442 {"input": "Another article...", "expected_output": "Another summary"} 2443 ], 2444 task=summarize_text, 2445 evaluators=[length_evaluator] 2446 ) 2447 2448 print(f"Processed {len(result.item_results)} items") 2449 for item_result in result.item_results: 2450 print(f"Input: {item_result.item['input']}") 2451 print(f"Output: {item_result.output}") 2452 print(f"Evaluations: {item_result.evaluations}") 2453 ``` 2454 2455 Advanced experiment with async task and multiple evaluators: 2456 ```python 2457 async def llm_task(*, item, **kwargs): 2458 # Simulate async LLM call 2459 response = await openai_client.chat.completions.create( 2460 model="gpt-4", 2461 messages=[{"role": "user", "content": item["input"]}] 2462 ) 2463 return response.choices[0].message.content 2464 2465 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 2466 if expected_output and expected_output.lower() in output.lower(): 2467 return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} 2468 return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} 2469 2470 def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): 2471 # Simulate toxicity check 2472 toxicity_score = check_toxicity(output) # Your toxicity checker 2473 return { 2474 "name": "toxicity", 2475 "value": toxicity_score, 2476 "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" 2477 } 2478 2479 def average_accuracy(*, item_results, **kwargs): 2480 accuracies = [ 2481 eval.value for result in item_results 2482 for eval in result.evaluations 2483 if eval.name == "accuracy" 2484 ] 2485 return { 2486 "name": "average_accuracy", 2487 "value": sum(accuracies) / len(accuracies) if accuracies else 0, 2488 "comment": f"Average accuracy across {len(accuracies)} items" 2489 } 2490 2491 result = langfuse.run_experiment( 2492 name="LLM Safety and Accuracy Test", 2493 description="Evaluate model accuracy and safety across diverse prompts", 2494 data=test_dataset, # Your dataset items 2495 task=llm_task, 2496 evaluators=[accuracy_evaluator, toxicity_evaluator], 2497 run_evaluators=[average_accuracy], 2498 max_concurrency=5, # Limit concurrent API calls 2499 metadata={"model": "gpt-4", "temperature": 0.7} 2500 ) 2501 ``` 2502 2503 Using with Langfuse datasets: 2504 ```python 2505 # Get dataset from Langfuse 2506 dataset = langfuse.get_dataset("my-eval-dataset") 2507 2508 result = dataset.run_experiment( 2509 name="Production Model Evaluation", 2510 description="Monthly evaluation of production model performance", 2511 task=my_production_task, 2512 evaluators=[accuracy_evaluator, latency_evaluator] 2513 ) 2514 2515 # Results automatically linked to dataset in Langfuse UI 2516 print(f"View results: {result['dataset_run_url']}") 2517 ``` 2518 2519 Note: 2520 - Task and evaluator functions can be either synchronous or asynchronous 2521 - Individual item failures are logged but don't stop the experiment 2522 - All executions are automatically traced and visible in Langfuse UI 2523 - When using Langfuse datasets, results are automatically linked for easy comparison 2524 - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.) 2525 - Async execution is handled automatically with smart event loop detection 2526 """ 2527 return cast( 2528 ExperimentResult, 2529 run_async_safely( 2530 self._run_experiment_async( 2531 name=name, 2532 run_name=self._create_experiment_run_name( 2533 name=name, run_name=run_name 2534 ), 2535 description=description, 2536 data=data, 2537 task=task, 2538 evaluators=evaluators or [], 2539 composite_evaluator=composite_evaluator, 2540 run_evaluators=run_evaluators or [], 2541 max_concurrency=max_concurrency, 2542 metadata=metadata, 2543 dataset_version=_dataset_version, 2544 ), 2545 ), 2546 )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be
used as the exact dataset run name if the
datacontains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp. - description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(, item, *kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces.
This metadata will be included in every trace created during the experiment.
If
dataare Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
def summarize_text(*, item, **kwargs): return f"Summary: {item['input'][:50]}..." def length_evaluator(*, input, output, expected_output=None, **kwargs): return { "name": "output_length", "value": len(output), "comment": f"Output contains {len(output)} characters" } result = langfuse.run_experiment( name="Text Summarization Test", description="Evaluate summarization quality and length", data=[ {"input": "Long article text...", "expected_output": "Expected summary"}, {"input": "Another article...", "expected_output": "Another summary"} ], task=summarize_text, evaluators=[length_evaluator] ) print(f"Processed {len(result.item_results)} items") for item_result in result.item_results: print(f"Input: {item_result.item['input']}") print(f"Output: {item_result.output}") print(f"Evaluations: {item_result.evaluations}")Advanced experiment with async task and multiple evaluators:
async def llm_task(*, item, **kwargs): # Simulate async LLM call response = await openai_client.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": item["input"]}] ) return response.choices[0].message.content def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if expected_output and expected_output.lower() in output.lower(): return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"} return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"} def toxicity_evaluator(*, input, output, expected_output=None, **kwargs): # Simulate toxicity check toxicity_score = check_toxicity(output) # Your toxicity checker return { "name": "toxicity", "value": toxicity_score, "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}" } def average_accuracy(*, item_results, **kwargs): accuracies = [ eval.value for result in item_results for eval in result.evaluations if eval.name == "accuracy" ] return { "name": "average_accuracy", "value": sum(accuracies) / len(accuracies) if accuracies else 0, "comment": f"Average accuracy across {len(accuracies)} items" } result = langfuse.run_experiment( name="LLM Safety and Accuracy Test", description="Evaluate model accuracy and safety across diverse prompts", data=test_dataset, # Your dataset items task=llm_task, evaluators=[accuracy_evaluator, toxicity_evaluator], run_evaluators=[average_accuracy], max_concurrency=5, # Limit concurrent API calls metadata={"model": "gpt-4", "temperature": 0.7} )Using with Langfuse datasets:
# Get dataset from Langfuse dataset = langfuse.get_dataset("my-eval-dataset") result = dataset.run_experiment( name="Production Model Evaluation", description="Monthly evaluation of production model performance", task=my_production_task, evaluators=[accuracy_evaluator, latency_evaluator] ) # Results automatically linked to dataset in Langfuse UI print(f"View results: {result['dataset_run_url']}")
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
2892 def run_batched_evaluation( 2893 self, 2894 *, 2895 scope: Literal["traces", "observations"], 2896 mapper: MapperFunction, 2897 filter: Optional[str] = None, 2898 fetch_batch_size: int = 50, 2899 fetch_trace_fields: Optional[str] = None, 2900 max_items: Optional[int] = None, 2901 max_retries: int = 3, 2902 evaluators: List[EvaluatorFunction], 2903 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 2904 max_concurrency: int = 5, 2905 metadata: Optional[Dict[str, Any]] = None, 2906 _add_observation_scores_to_trace: bool = False, 2907 _additional_trace_tags: Optional[List[str]] = None, 2908 resume_from: Optional[BatchEvaluationResumeToken] = None, 2909 verbose: bool = False, 2910 ) -> BatchEvaluationResult: 2911 """Fetch traces or observations and run evaluations on each item. 2912 2913 This method provides a powerful way to evaluate existing data in Langfuse at scale. 2914 It fetches items based on filters, transforms them using a mapper function, runs 2915 evaluators on each item, and creates scores that are linked back to the original 2916 entities. This is ideal for: 2917 2918 - Running evaluations on production traces after deployment 2919 - Backtesting new evaluation metrics on historical data 2920 - Batch scoring of observations for quality monitoring 2921 - Periodic evaluation runs on recent data 2922 2923 The method uses a streaming/pipeline approach to process items in batches, making 2924 it memory-efficient for large datasets. It includes comprehensive error handling, 2925 retry logic, and resume capability for long-running evaluations. 2926 2927 Args: 2928 scope: The type of items to evaluate. Must be one of: 2929 - "traces": Evaluate complete traces with all their observations 2930 - "observations": Evaluate individual observations (spans, generations, events) 2931 mapper: Function that transforms API response objects into evaluator inputs. 2932 Receives a trace/observation object and returns an EvaluatorInputs 2933 instance with input, output, expected_output, and metadata fields. 2934 Can be sync or async. 2935 evaluators: List of evaluation functions to run on each item. Each evaluator 2936 receives the mapped inputs and returns Evaluation object(s). Evaluator 2937 failures are logged but don't stop the batch evaluation. 2938 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 2939 - '{"tags": ["production"]}' 2940 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 2941 Default: None (fetches all items). 2942 fetch_batch_size: Number of items to fetch per API call and hold in memory. 2943 Larger values may be faster but use more memory. Default: 50. 2944 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 2945 max_items: Maximum total number of items to process. If None, processes all 2946 items matching the filter. Useful for testing or limiting evaluation runs. 2947 Default: None (process all). 2948 max_concurrency: Maximum number of items to evaluate concurrently. Controls 2949 parallelism and resource usage. Default: 5. 2950 composite_evaluator: Optional function that creates a composite score from 2951 item-level evaluations. Receives the original item and its evaluations, 2952 returns a single Evaluation. Useful for weighted averages or combined metrics. 2953 Default: None. 2954 metadata: Optional metadata dict to add to all created scores. Useful for 2955 tracking evaluation runs, versions, or other context. Default: None. 2956 max_retries: Maximum number of retry attempts for failed batch fetches. 2957 Uses exponential backoff (1s, 2s, 4s). Default: 3. 2958 verbose: If True, logs progress information to console. Useful for monitoring 2959 long-running evaluations. Default: False. 2960 resume_from: Optional resume token from a previous incomplete run. Allows 2961 continuing evaluation after interruption or failure. Default: None. 2962 2963 2964 Returns: 2965 BatchEvaluationResult containing: 2966 - total_items_fetched: Number of items fetched from API 2967 - total_items_processed: Number of items successfully evaluated 2968 - total_items_failed: Number of items that failed evaluation 2969 - total_scores_created: Scores created by item-level evaluators 2970 - total_composite_scores_created: Scores created by composite evaluator 2971 - total_evaluations_failed: Individual evaluator failures 2972 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 2973 - resume_token: Token for resuming if incomplete (None if completed) 2974 - completed: True if all items processed 2975 - duration_seconds: Total execution time 2976 - failed_item_ids: IDs of items that failed 2977 - error_summary: Error types and counts 2978 - has_more_items: True if max_items reached but more exist 2979 2980 Raises: 2981 ValueError: If invalid scope is provided. 2982 2983 Examples: 2984 Basic trace evaluation: 2985 ```python 2986 from langfuse import Langfuse, EvaluatorInputs, Evaluation 2987 2988 client = Langfuse() 2989 2990 # Define mapper to extract fields from traces 2991 def trace_mapper(trace): 2992 return EvaluatorInputs( 2993 input=trace.input, 2994 output=trace.output, 2995 expected_output=None, 2996 metadata={"trace_id": trace.id} 2997 ) 2998 2999 # Define evaluator 3000 def length_evaluator(*, input, output, expected_output, metadata): 3001 return Evaluation( 3002 name="output_length", 3003 value=len(output) if output else 0 3004 ) 3005 3006 # Run batch evaluation 3007 result = client.run_batched_evaluation( 3008 scope="traces", 3009 mapper=trace_mapper, 3010 evaluators=[length_evaluator], 3011 filter='{"tags": ["production"]}', 3012 max_items=1000, 3013 verbose=True 3014 ) 3015 3016 print(f"Processed {result.total_items_processed} traces") 3017 print(f"Created {result.total_scores_created} scores") 3018 ``` 3019 3020 Evaluation with composite scorer: 3021 ```python 3022 def accuracy_evaluator(*, input, output, expected_output, metadata): 3023 # ... evaluation logic 3024 return Evaluation(name="accuracy", value=0.85) 3025 3026 def relevance_evaluator(*, input, output, expected_output, metadata): 3027 # ... evaluation logic 3028 return Evaluation(name="relevance", value=0.92) 3029 3030 def composite_evaluator(*, item, evaluations): 3031 # Weighted average of evaluations 3032 weights = {"accuracy": 0.6, "relevance": 0.4} 3033 total = sum( 3034 e.value * weights.get(e.name, 0) 3035 for e in evaluations 3036 if isinstance(e.value, (int, float)) 3037 ) 3038 return Evaluation( 3039 name="composite_score", 3040 value=total, 3041 comment=f"Weighted average of {len(evaluations)} metrics" 3042 ) 3043 3044 result = client.run_batched_evaluation( 3045 scope="traces", 3046 mapper=trace_mapper, 3047 evaluators=[accuracy_evaluator, relevance_evaluator], 3048 composite_evaluator=composite_evaluator, 3049 filter='{"user_id": "important_user"}', 3050 verbose=True 3051 ) 3052 ``` 3053 3054 Handling incomplete runs with resume: 3055 ```python 3056 # Initial run that may fail or timeout 3057 result = client.run_batched_evaluation( 3058 scope="observations", 3059 mapper=obs_mapper, 3060 evaluators=[my_evaluator], 3061 max_items=10000, 3062 verbose=True 3063 ) 3064 3065 # Check if incomplete 3066 if not result.completed and result.resume_token: 3067 print(f"Processed {result.resume_token.items_processed} items before interruption") 3068 3069 # Resume from where it left off 3070 result = client.run_batched_evaluation( 3071 scope="observations", 3072 mapper=obs_mapper, 3073 evaluators=[my_evaluator], 3074 resume_from=result.resume_token, 3075 verbose=True 3076 ) 3077 3078 print(f"Total items processed: {result.total_items_processed}") 3079 ``` 3080 3081 Monitoring evaluator performance: 3082 ```python 3083 result = client.run_batched_evaluation(...) 3084 3085 for stats in result.evaluator_stats: 3086 success_rate = stats.successful_runs / stats.total_runs 3087 print(f"{stats.name}:") 3088 print(f" Success rate: {success_rate:.1%}") 3089 print(f" Scores created: {stats.total_scores_created}") 3090 3091 if stats.failed_runs > 0: 3092 print(f" â ī¸ Failed {stats.failed_runs} times") 3093 ``` 3094 3095 Note: 3096 - Evaluator failures are logged but don't stop the batch evaluation 3097 - Individual item failures are tracked but don't stop processing 3098 - Fetch failures are retried with exponential backoff 3099 - All scores are automatically flushed to Langfuse at the end 3100 - The resume mechanism uses timestamp-based filtering to avoid duplicates 3101 """ 3102 runner = BatchEvaluationRunner(self) 3103 3104 return cast( 3105 BatchEvaluationResult, 3106 run_async_safely( 3107 runner.run_async( 3108 scope=scope, 3109 mapper=mapper, 3110 evaluators=evaluators, 3111 filter=filter, 3112 fetch_batch_size=fetch_batch_size, 3113 fetch_trace_fields=fetch_trace_fields, 3114 max_items=max_items, 3115 max_concurrency=max_concurrency, 3116 composite_evaluator=composite_evaluator, 3117 metadata=metadata, 3118 _add_observation_scores_to_trace=_add_observation_scores_to_trace, 3119 _additional_trace_tags=_additional_trace_tags, 3120 max_retries=max_retries, 3121 verbose=verbose, 3122 resume_from=resume_from, 3123 ) 3124 ), 3125 )
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing: - total_items_fetched: Number of items fetched from API - total_items_processed: Number of items successfully evaluated - total_items_failed: Number of items that failed evaluation - total_scores_created: Scores created by item-level evaluators - total_composite_scores_created: Scores created by composite evaluator - total_evaluations_failed: Individual evaluator failures - evaluator_stats: Per-evaluator statistics (success rate, scores created) - resume_token: Token for resuming if incomplete (None if completed) - completed: True if all items processed - duration_seconds: Total execution time - failed_item_ids: IDs of items that failed - error_summary: Error types and counts - has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
from langfuse import Langfuse, EvaluatorInputs, Evaluation client = Langfuse() # Define mapper to extract fields from traces def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, metadata={"trace_id": trace.id} ) # Define evaluator def length_evaluator(*, input, output, expected_output, metadata): return Evaluation( name="output_length", value=len(output) if output else 0 ) # Run batch evaluation result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[length_evaluator], filter='{"tags": ["production"]}', max_items=1000, verbose=True ) print(f"Processed {result.total_items_processed} traces") print(f"Created {result.total_scores_created} scores")Evaluation with composite scorer:
def accuracy_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="accuracy", value=0.85) def relevance_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="relevance", value=0.92) def composite_evaluator(*, item, evaluations): # Weighted average of evaluations weights = {"accuracy": 0.6, "relevance": 0.4} total = sum( e.value * weights.get(e.name, 0) for e in evaluations if isinstance(e.value, (int, float)) ) return Evaluation( name="composite_score", value=total, comment=f"Weighted average of {len(evaluations)} metrics" ) result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[accuracy_evaluator, relevance_evaluator], composite_evaluator=composite_evaluator, filter='{"user_id": "important_user"}', verbose=True )Handling incomplete runs with resume:
# Initial run that may fail or timeout result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], max_items=10000, verbose=True ) # Check if incomplete if not result.completed and result.resume_token: print(f"Processed {result.resume_token.items_processed} items before interruption") # Resume from where it left off result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], resume_from=result.resume_token, verbose=True ) print(f"Total items processed: {result.total_items_processed}")Monitoring evaluator performance:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs print(f"{stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" â ī¸ Failed {stats.failed_runs} times")
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
3127 def auth_check(self) -> bool: 3128 """Check if the provided credentials (public and secret key) are valid. 3129 3130 Raises: 3131 Exception: If no projects were found for the provided credentials. 3132 3133 Note: 3134 This method is blocking. It is discouraged to use it in production code. 3135 """ 3136 try: 3137 projects = self.api.projects.get() 3138 langfuse_logger.debug( 3139 f"Auth check successful, found {len(projects.data)} projects" 3140 ) 3141 if len(projects.data) == 0: 3142 raise Exception( 3143 "Auth check failed, no project found for the keys provided." 3144 ) 3145 return True 3146 3147 except AttributeError as e: 3148 langfuse_logger.warning( 3149 f"Auth check failed: Client not properly initialized. Error: {e}" 3150 ) 3151 return False 3152 3153 except Error as e: 3154 handle_fern_exception(e) 3155 raise e
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
3157 def create_dataset( 3158 self, 3159 *, 3160 name: str, 3161 description: Optional[str] = None, 3162 metadata: Optional[Any] = None, 3163 input_schema: Optional[Any] = None, 3164 expected_output_schema: Optional[Any] = None, 3165 ) -> Dataset: 3166 """Create a dataset with the given name on Langfuse. 3167 3168 Args: 3169 name: Name of the dataset to create. 3170 description: Description of the dataset. Defaults to None. 3171 metadata: Additional metadata. Defaults to None. 3172 input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema. 3173 expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema. 3174 3175 Returns: 3176 Dataset: The created dataset as returned by the Langfuse API. 3177 """ 3178 try: 3179 langfuse_logger.debug(f"Creating datasets {name}") 3180 3181 result = self.api.datasets.create( 3182 name=name, 3183 description=description, 3184 metadata=metadata, 3185 input_schema=input_schema, 3186 expected_output_schema=expected_output_schema, 3187 ) 3188 3189 return cast(Dataset, result) 3190 3191 except Error as e: 3192 handle_fern_exception(e) 3193 raise e
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
3195 def create_dataset_item( 3196 self, 3197 *, 3198 dataset_name: str, 3199 input: Optional[Any] = None, 3200 expected_output: Optional[Any] = None, 3201 metadata: Optional[Any] = None, 3202 source_trace_id: Optional[str] = None, 3203 source_observation_id: Optional[str] = None, 3204 status: Optional[DatasetStatus] = None, 3205 id: Optional[str] = None, 3206 ) -> DatasetItem: 3207 """Create a dataset item. 3208 3209 Upserts if an item with id already exists. 3210 3211 Args: 3212 dataset_name: Name of the dataset in which the dataset item should be created. 3213 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3214 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3215 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3216 source_trace_id: Id of the source trace. Defaults to None. 3217 source_observation_id: Id of the source observation. Defaults to None. 3218 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3219 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3220 3221 Returns: 3222 DatasetItem: The created dataset item as returned by the Langfuse API. 3223 3224 Example: 3225 ```python 3226 from langfuse import Langfuse 3227 3228 langfuse = Langfuse() 3229 3230 # Uploading items to the Langfuse dataset named "capital_cities" 3231 langfuse.create_dataset_item( 3232 dataset_name="capital_cities", 3233 input={"input": {"country": "Italy"}}, 3234 expected_output={"expected_output": "Rome"}, 3235 metadata={"foo": "bar"} 3236 ) 3237 ``` 3238 """ 3239 try: 3240 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3241 3242 result = self.api.dataset_items.create( 3243 dataset_name=dataset_name, 3244 input=input, 3245 expected_output=expected_output, 3246 metadata=metadata, 3247 source_trace_id=source_trace_id, 3248 source_observation_id=source_observation_id, 3249 status=status, 3250 id=id, 3251 ) 3252 3253 return cast(DatasetItem, result) 3254 except Error as e: 3255 handle_fern_exception(e) 3256 raise e
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
from langfuse import Langfuse langfuse = Langfuse() # Uploading items to the Langfuse dataset named "capital_cities" langfuse.create_dataset_item( dataset_name="capital_cities", input={"input": {"country": "Italy"}}, expected_output={"expected_output": "Rome"}, metadata={"foo": "bar"} )
3258 def resolve_media_references( 3259 self, 3260 *, 3261 obj: Any, 3262 resolve_with: Literal["base64_data_uri"], 3263 max_depth: int = 10, 3264 content_fetch_timeout_seconds: int = 5, 3265 ) -> Any: 3266 """Replace media reference strings in an object with base64 data URIs. 3267 3268 This method recursively traverses an object (up to max_depth) looking for media reference strings 3269 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3270 the provided Langfuse client and replaces the reference string with a base64 data URI. 3271 3272 If fetching media content fails for a reference string, a warning is logged and the reference 3273 string is left unchanged. 3274 3275 Args: 3276 obj: The object to process. Can be a primitive value, array, or nested object. 3277 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3278 resolve_with: The representation of the media content to replace the media reference string with. 3279 Currently only "base64_data_uri" is supported. 3280 max_depth: int: The maximum depth to traverse the object. Default is 10. 3281 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3282 3283 Returns: 3284 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3285 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3286 3287 Example: 3288 obj = { 3289 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3290 "nested": { 3291 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3292 } 3293 } 3294 3295 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3296 3297 # Result: 3298 # { 3299 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3300 # "nested": { 3301 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 3302 # } 3303 # } 3304 """ 3305 return LangfuseMedia.resolve_media_references( 3306 langfuse_client=self, 3307 obj=obj, 3308 resolve_with=resolve_with, 3309 max_depth=max_depth, 3310 content_fetch_timeout_seconds=content_fetch_timeout_seconds, 3311 )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
3341 def get_prompt( 3342 self, 3343 name: str, 3344 *, 3345 version: Optional[int] = None, 3346 label: Optional[str] = None, 3347 type: Literal["chat", "text"] = "text", 3348 cache_ttl_seconds: Optional[int] = None, 3349 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None, 3350 max_retries: Optional[int] = None, 3351 fetch_timeout_seconds: Optional[int] = None, 3352 ) -> PromptClient: 3353 """Get a prompt. 3354 3355 This method attempts to fetch the requested prompt from the local cache. If the prompt is not found 3356 in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again 3357 and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will 3358 return the expired prompt as a fallback. 3359 3360 Args: 3361 name (str): The name of the prompt to retrieve. 3362 3363 Keyword Args: 3364 version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3365 label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both. 3366 cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a 3367 keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. 3368 type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". 3369 fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. 3370 max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. 3371 fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default. 3372 3373 Returns: 3374 The prompt object retrieved from the cache or directly fetched if not cached or expired of type 3375 - TextPromptClient, if type argument is 'text'. 3376 - ChatPromptClient, if type argument is 'chat'. 3377 3378 Raises: 3379 Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an 3380 expired prompt in the cache, in which case it logs a warning and returns the expired prompt. 3381 """ 3382 if self._resources is None: 3383 raise Error( 3384 "SDK is not correctly initialized. Check the init logs for more details." 3385 ) 3386 if version is not None and label is not None: 3387 raise ValueError("Cannot specify both version and label at the same time.") 3388 3389 if not name: 3390 raise ValueError("Prompt name cannot be empty.") 3391 3392 cache_key = PromptCache.generate_cache_key(name, version=version, label=label) 3393 bounded_max_retries = self._get_bounded_max_retries( 3394 max_retries, default_max_retries=2, max_retries_upper_bound=4 3395 ) 3396 3397 langfuse_logger.debug(f"Getting prompt '{cache_key}'") 3398 cached_prompt = self._resources.prompt_cache.get(cache_key) 3399 3400 if cached_prompt is None or cache_ttl_seconds == 0: 3401 langfuse_logger.debug( 3402 f"Prompt '{cache_key}' not found in cache or caching disabled." 3403 ) 3404 try: 3405 return self._fetch_prompt_and_update_cache( 3406 name, 3407 version=version, 3408 label=label, 3409 ttl_seconds=cache_ttl_seconds, 3410 max_retries=bounded_max_retries, 3411 fetch_timeout_seconds=fetch_timeout_seconds, 3412 ) 3413 except Exception as e: 3414 if fallback: 3415 langfuse_logger.warning( 3416 f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}" 3417 ) 3418 3419 fallback_client_args: Dict[str, Any] = { 3420 "name": name, 3421 "prompt": fallback, 3422 "type": type, 3423 "version": version or 0, 3424 "config": {}, 3425 "labels": [label] if label else [], 3426 "tags": [], 3427 } 3428 3429 if type == "text": 3430 return TextPromptClient( 3431 prompt=Prompt_Text(**fallback_client_args), 3432 is_fallback=True, 3433 ) 3434 3435 if type == "chat": 3436 return ChatPromptClient( 3437 prompt=Prompt_Chat(**fallback_client_args), 3438 is_fallback=True, 3439 ) 3440 3441 raise e 3442 3443 if cached_prompt.is_expired(): 3444 langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.") 3445 try: 3446 # refresh prompt in background thread, refresh_prompt deduplicates tasks 3447 langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.") 3448 3449 def refresh_task() -> None: 3450 self._fetch_prompt_and_update_cache( 3451 name, 3452 version=version, 3453 label=label, 3454 ttl_seconds=cache_ttl_seconds, 3455 max_retries=bounded_max_retries, 3456 fetch_timeout_seconds=fetch_timeout_seconds, 3457 ) 3458 3459 self._resources.prompt_cache.add_refresh_prompt_task( 3460 cache_key, 3461 refresh_task, 3462 ) 3463 langfuse_logger.debug( 3464 f"Returning stale prompt '{cache_key}' from cache." 3465 ) 3466 # return stale prompt 3467 return cached_prompt.value 3468 3469 except Exception as e: 3470 langfuse_logger.warning( 3471 f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}" 3472 ) 3473 # creation of refresh prompt task failed, return stale prompt 3474 return cached_prompt.value 3475 3476 return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the
productionlabel is returned. Specify either version or label, not both. label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, theproductionlabel is returned. Specify either version or label, not both. cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0. type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None. max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds. fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
- expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3578 def create_prompt( 3579 self, 3580 *, 3581 name: str, 3582 prompt: Union[ 3583 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3584 ], 3585 labels: List[str] = [], 3586 tags: Optional[List[str]] = None, 3587 type: Optional[Literal["chat", "text"]] = "text", 3588 config: Optional[Any] = None, 3589 commit_message: Optional[str] = None, 3590 ) -> PromptClient: 3591 """Create a new prompt in Langfuse. 3592 3593 Keyword Args: 3594 name : The name of the prompt to be created. 3595 prompt : The content of the prompt to be created. 3596 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3597 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3598 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3599 config: Additional structured data to be saved with the prompt. Defaults to None. 3600 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3601 commit_message: Optional string describing the change. 3602 3603 Returns: 3604 TextPromptClient: The prompt if type argument is 'text'. 3605 ChatPromptClient: The prompt if type argument is 'chat'. 3606 """ 3607 try: 3608 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3609 3610 if type == "chat": 3611 if not isinstance(prompt, list): 3612 raise ValueError( 3613 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 3614 ) 3615 request: Union[CreateChatPromptRequest, CreateTextPromptRequest] = ( 3616 CreateChatPromptRequest( 3617 name=name, 3618 prompt=cast(Any, prompt), 3619 labels=labels, 3620 tags=tags, 3621 config=config or {}, 3622 commit_message=commit_message, 3623 type=CreateChatPromptType.CHAT, 3624 ) 3625 ) 3626 server_prompt = self.api.prompts.create(request=request) 3627 3628 if self._resources is not None: 3629 self._resources.prompt_cache.invalidate(name) 3630 3631 return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt)) 3632 3633 if not isinstance(prompt, str): 3634 raise ValueError("For 'text' type, 'prompt' must be a string.") 3635 3636 request = CreateTextPromptRequest( 3637 name=name, 3638 prompt=prompt, 3639 labels=labels, 3640 tags=tags, 3641 config=config or {}, 3642 commit_message=commit_message, 3643 ) 3644 3645 server_prompt = self.api.prompts.create(request=request) 3646 3647 if self._resources is not None: 3648 self._resources.prompt_cache.invalidate(name) 3649 3650 return TextPromptClient(prompt=cast(Prompt_Text, server_prompt)) 3651 3652 except Error as e: 3653 handle_fern_exception(e) 3654 raise e
Create a new prompt in Langfuse.
Keyword Args:
name : The name of the prompt to be created. prompt : The content of the prompt to be created. is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. config: Additional structured data to be saved with the prompt. Defaults to None. type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
3656 def update_prompt( 3657 self, 3658 *, 3659 name: str, 3660 version: int, 3661 new_labels: List[str] = [], 3662 ) -> Any: 3663 """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name. 3664 3665 Args: 3666 name (str): The name of the prompt to update. 3667 version (int): The version number of the prompt to update. 3668 new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to []. 3669 3670 Returns: 3671 Prompt: The updated prompt from the Langfuse API. 3672 3673 """ 3674 updated_prompt = self.api.prompt_version.update( 3675 name=self._url_encode(name), 3676 version=version, 3677 new_labels=new_labels, 3678 ) 3679 3680 if self._resources is not None: 3681 self._resources.prompt_cache.invalidate(name) 3682 3683 return updated_prompt
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts witht he specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
3698 def clear_prompt_cache(self) -> None: 3699 """Clear the entire prompt cache, removing all cached prompts. 3700 3701 This method is useful when you want to force a complete refresh of all 3702 cached prompts, for example after major updates or when you need to 3703 ensure the latest versions are fetched from the server. 3704 """ 3705 if self._resources is not None: 3706 self._resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
62def get_client(*, public_key: Optional[str] = None) -> Langfuse: 63 """Get or create a Langfuse client instance. 64 65 Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, 66 providing a public_key is required. Multi-project support is experimental - see Langfuse docs. 67 68 Behavior: 69 - Single project: Returns existing client or creates new one 70 - Multi-project: Requires public_key to return specific client 71 - No public_key in multi-project: Returns disabled client to prevent data leakage 72 73 The function uses a singleton pattern per public_key to conserve resources and maintain state. 74 75 Args: 76 public_key (Optional[str]): Project identifier 77 - With key: Returns client for that project 78 - Without key: Returns single client or disabled client if multiple exist 79 80 Returns: 81 Langfuse: Client instance in one of three states: 82 1. Client for specified public_key 83 2. Default client for single-project setup 84 3. Disabled client when multiple projects exist without key 85 86 Security: 87 Disables tracing when multiple projects exist without explicit key to prevent 88 cross-project data leakage. Multi-project setups are experimental. 89 90 Example: 91 ```python 92 # Single project 93 client = get_client() # Default client 94 95 # In multi-project usage: 96 client_a = get_client(public_key="project_a_key") # Returns project A's client 97 client_b = get_client(public_key="project_b_key") # Returns project B's client 98 99 # Without specific key in multi-project setup: 100 client = get_client() # Returns disabled client for safety 101 ``` 102 """ 103 with LangfuseResourceManager._lock: 104 active_instances = LangfuseResourceManager._instances 105 106 # If no explicit public_key provided, check execution context 107 if not public_key: 108 public_key = _current_public_key.get(None) 109 110 if not public_key: 111 if len(active_instances) == 0: 112 # No clients initialized yet, create default instance 113 return Langfuse() 114 115 if len(active_instances) == 1: 116 # Only one client exists, safe to use without specifying key 117 instance = list(active_instances.values())[0] 118 119 # Initialize with the credentials bound to the instance 120 # This is important if the original instance was instantiated 121 # via constructor arguments 122 return _create_client_from_instance(instance) 123 124 else: 125 # Multiple clients exist but no key specified - disable tracing 126 # to prevent cross-project data leakage 127 langfuse_logger.warning( 128 "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage." 129 ) 130 return Langfuse( 131 tracing_enabled=False, public_key="fake", secret_key="fake" 132 ) 133 134 else: 135 # Specific key provided, look up existing instance 136 target_instance: Optional[LangfuseResourceManager] = active_instances.get( 137 public_key, None 138 ) 139 140 if target_instance is None: 141 # No instance found with this key - client not initialized properly 142 langfuse_logger.warning( 143 f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function." 144 ) 145 return Langfuse( 146 tracing_enabled=False, public_key="fake", secret_key="fake" 147 ) 148 149 # target_instance is guaranteed to be not None at this point 150 return _create_client_from_instance(target_instance, public_key)
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
# Single project client = get_client() # Default client # In multi-project usage: client_a = get_client(public_key="project_a_key") # Returns project A's client client_b = get_client(public_key="project_b_key") # Returns project B's client # Without specific key in multi-project setup: client = get_client() # Returns disabled client for safety
90 def observe( 91 self, 92 func: Optional[F] = None, 93 *, 94 name: Optional[str] = None, 95 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 96 capture_input: Optional[bool] = None, 97 capture_output: Optional[bool] = None, 98 transform_to_string: Optional[Callable[[Iterable], str]] = None, 99 ) -> Union[F, Callable[[F], F]]: 100 """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions. 101 102 This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates 103 spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator 104 intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints. 105 106 Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, 107 enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details. 108 109 Args: 110 func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None. 111 name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used. 112 as_type (Optional[Literal]): Set the observation type. Supported values: 113 "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". 114 Observation types are highlighted in the Langfuse UI for filtering and visualization. 115 The types "generation" and "embedding" create a span on which additional attributes such as model metrics 116 can be set. 117 118 Returns: 119 Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans. 120 121 Example: 122 For general function tracing with automatic naming: 123 ```python 124 @observe() 125 def process_user_request(user_id, query): 126 # Function is automatically traced with name "process_user_request" 127 return get_response(query) 128 ``` 129 130 For language model generation tracking: 131 ```python 132 @observe(name="answer-generation", as_type="generation") 133 async def generate_answer(query): 134 # Creates a generation-type span with extended LLM metrics 135 response = await openai.chat.completions.create( 136 model="gpt-4", 137 messages=[{"role": "user", "content": query}] 138 ) 139 return response.choices[0].message.content 140 ``` 141 142 For trace context propagation between functions: 143 ```python 144 @observe() 145 def main_process(): 146 # Parent span is created 147 return sub_process() # Child span automatically connected to parent 148 149 @observe() 150 def sub_process(): 151 # Automatically becomes a child span of main_process 152 return "result" 153 ``` 154 155 Raises: 156 Exception: Propagates any exceptions from the wrapped function after logging them in the trace. 157 158 Notes: 159 - The decorator preserves the original function's signature, docstring, and return type. 160 - Proper parent-child relationships between spans are automatically maintained. 161 - Special keyword arguments can be passed to control tracing: 162 - langfuse_trace_id: Explicitly set the trace ID for this function call 163 - langfuse_parent_observation_id: Explicitly set the parent span ID 164 - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist) 165 - For async functions, the decorator returns an async function wrapper. 166 - For sync functions, the decorator returns a synchronous wrapper. 167 """ 168 valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent)) 169 if as_type is not None and as_type not in valid_types: 170 self._log.warning( 171 f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'." 172 ) 173 as_type = "span" 174 175 function_io_capture_enabled = os.environ.get( 176 LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True" 177 ).lower() not in ("false", "0") 178 179 should_capture_input = ( 180 capture_input if capture_input is not None else function_io_capture_enabled 181 ) 182 183 should_capture_output = ( 184 capture_output 185 if capture_output is not None 186 else function_io_capture_enabled 187 ) 188 189 def decorator(func: F) -> F: 190 return ( 191 self._async_observe( 192 func, 193 name=name, 194 as_type=as_type, 195 capture_input=should_capture_input, 196 capture_output=should_capture_output, 197 transform_to_string=transform_to_string, 198 ) 199 if asyncio.iscoroutinefunction(func) 200 else self._sync_observe( 201 func, 202 name=name, 203 as_type=as_type, 204 capture_input=should_capture_input, 205 capture_output=should_capture_output, 206 transform_to_string=transform_to_string, 207 ) 208 ) 209 210 """Handle decorator with or without parentheses. 211 212 This logic enables the decorator to work both with and without parentheses: 213 - @observe - Python passes the function directly to the decorator 214 - @observe() - Python calls the decorator first, which must return a function decorator 215 216 When called without arguments (@observe), the func parameter contains the function to decorate, 217 so we directly apply the decorator to it. When called with parentheses (@observe()), 218 func is None, so we return the decorator function itself for Python to apply in the next step. 219 """ 220 if func is None: 221 return decorator 222 else: 223 return decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
@observe() def process_user_request(user_id, query): # Function is automatically traced with name "process_user_request" return get_response(query)For language model generation tracking:
@observe(name="answer-generation", as_type="generation") async def generate_answer(query): # Creates a generation-type span with extended LLM metrics response = await openai.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": query}] ) return response.choices[0].message.contentFor trace context propagation between functions:
@observe() def main_process(): # Parent span is created return sub_process() # Child span automatically connected to parent @observe() def sub_process(): # Automatically becomes a child span of main_process return "result"
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
76def propagate_attributes( 77 *, 78 user_id: Optional[str] = None, 79 session_id: Optional[str] = None, 80 metadata: Optional[Dict[str, str]] = None, 81 version: Optional[str] = None, 82 tags: Optional[List[str]] = None, 83 trace_name: Optional[str] = None, 84 as_baggage: bool = False, 85) -> _AgnosticContextManager[Any]: 86 """Propagate trace-level attributes to all spans created within this context. 87 88 This context manager sets attributes on the currently active span AND automatically 89 propagates them to all new child spans created within the context. This is the 90 recommended way to set trace-level attributes like user_id, session_id, and metadata 91 dimensions that should be consistently applied across all observations in a trace. 92 93 **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the 94 currently active span and spans created after entering this context will have these 95 attributes. Pre-existing spans will NOT be retroactively updated. 96 97 **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id, 98 filtering by session_id) only include observations that have the attribute set. 99 If you call `propagate_attributes` late in your workflow, earlier spans won't be 100 included in aggregations for that attribute. 101 102 Args: 103 user_id: User identifier to associate with all spans in this context. 104 Must be US-ASCII string, â¤200 characters. Use this to track which user 105 generated each trace and enable e.g. per-user cost/performance analysis. 106 session_id: Session identifier to associate with all spans in this context. 107 Must be US-ASCII string, â¤200 characters. Use this to group related traces 108 within a user session (e.g., a conversation thread, multi-turn interaction). 109 metadata: Additional key-value metadata to propagate to all spans. 110 - Keys and values must be US-ASCII strings 111 - All values must be â¤200 characters 112 - Use for dimensions like internal correlating identifiers 113 - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning) 114 version: Version identfier for parts of your application that are independently versioned, e.g. agents 115 tags: List of tags to categorize the group of observations 116 trace_name: Name to assign to the trace. Must be US-ASCII string, â¤200 characters. 117 Use this to set a consistent trace name for all spans created within this context. 118 as_baggage: If True, propagates attributes using OpenTelemetry baggage for 119 cross-process/service propagation. **Security warning**: When enabled, 120 attribute values are added to HTTP headers on ALL outbound requests. 121 Only enable if values are safe to transmit via HTTP headers and you need 122 cross-service tracing. Default: False. 123 124 Returns: 125 Context manager that propagates attributes to all child spans. 126 127 Example: 128 Basic usage with user and session tracking: 129 130 ```python 131 from langfuse import Langfuse 132 133 langfuse = Langfuse() 134 135 # Set attributes early in the trace 136 with langfuse.start_as_current_observation(name="user_workflow") as span: 137 with langfuse.propagate_attributes( 138 user_id="user_123", 139 session_id="session_abc", 140 metadata={"experiment": "variant_a", "environment": "production"} 141 ): 142 # All spans created here will have user_id, session_id, and metadata 143 with langfuse.start_observation(name="llm_call") as llm_span: 144 # This span inherits: user_id, session_id, experiment, environment 145 ... 146 147 with langfuse.start_generation(name="completion") as gen: 148 # This span also inherits all attributes 149 ... 150 ``` 151 152 Late propagation (anti-pattern): 153 154 ```python 155 with langfuse.start_as_current_observation(name="workflow") as span: 156 # These spans WON'T have user_id 157 early_span = langfuse.start_observation(name="early_work") 158 early_span.end() 159 160 # Set attributes in the middle 161 with langfuse.propagate_attributes(user_id="user_123"): 162 # Only spans created AFTER this point will have user_id 163 late_span = langfuse.start_observation(name="late_work") 164 late_span.end() 165 166 # Result: Aggregations by user_id will miss "early_work" span 167 ``` 168 169 Cross-service propagation with baggage (advanced): 170 171 ```python 172 # Service A - originating service 173 with langfuse.start_as_current_observation(name="api_request"): 174 with langfuse.propagate_attributes( 175 user_id="user_123", 176 session_id="session_abc", 177 as_baggage=True # Propagate via HTTP headers 178 ): 179 # Make HTTP request to Service B 180 response = requests.get("https://service-b.example.com/api") 181 # user_id and session_id are now in HTTP headers 182 183 # Service B - downstream service 184 # OpenTelemetry will automatically extract baggage from HTTP headers 185 # and propagate to spans in Service B 186 ``` 187 188 Note: 189 - **Validation**: All attribute values (user_id, session_id, metadata values) 190 must be strings â¤200 characters. Invalid values will be dropped with a 191 warning logged. Ensure values meet constraints before calling. 192 - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood, 193 making it compatible with other OTel-instrumented libraries. 194 195 Raises: 196 No exceptions are raised. Invalid values are logged as warnings and dropped. 197 """ 198 return _propagate_attributes( 199 user_id=user_id, 200 session_id=session_id, 201 metadata=metadata, 202 version=version, 203 tags=tags, 204 trace_name=trace_name, 205 as_baggage=as_baggage, 206 )
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, â¤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, â¤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys and values must be US-ASCII strings
- All values must be â¤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
- version: Version identfier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, â¤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
from langfuse import Langfuse langfuse = Langfuse() # Set attributes early in the trace with langfuse.start_as_current_observation(name="user_workflow") as span: with langfuse.propagate_attributes( user_id="user_123", session_id="session_abc", metadata={"experiment": "variant_a", "environment": "production"} ): # All spans created here will have user_id, session_id, and metadata with langfuse.start_observation(name="llm_call") as llm_span: # This span inherits: user_id, session_id, experiment, environment ... with langfuse.start_generation(name="completion") as gen: # This span also inherits all attributes ...Late propagation (anti-pattern):
with langfuse.start_as_current_observation(name="workflow") as span: # These spans WON'T have user_id early_span = langfuse.start_observation(name="early_work") early_span.end() # Set attributes in the middle with langfuse.propagate_attributes(user_id="user_123"): # Only spans created AFTER this point will have user_id late_span = langfuse.start_observation(name="late_work") late_span.end() # Result: Aggregations by user_id will miss "early_work" spanCross-service propagation with baggage (advanced):
# Service A - originating service with langfuse.start_as_current_observation(name="api_request"): with langfuse.propagate_attributes( user_id="user_123", session_id="session_abc", as_baggage=True # Propagate via HTTP headers ): # Make HTTP request to Service B response = requests.get("https://service-b.example.com/api") # user_id and session_id are now in HTTP headers # Service B - downstream service # OpenTelemetry will automatically extract baggage from HTTP headers # and propagate to spans in Service B
Note:
- Validation: All attribute values (user_id, session_id, metadata values) must be strings â¤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
1236class LangfuseSpan(LangfuseObservationWrapper): 1237 """Standard span implementation for general operations in Langfuse. 1238 1239 This class represents a general-purpose span that can be used to trace 1240 any operation in your application. It extends the base LangfuseObservationWrapper 1241 with specific methods for creating child spans, generations, and updating 1242 span-specific attributes. If possible, use a more specific type for 1243 better observability and insights. 1244 """ 1245 1246 def __init__( 1247 self, 1248 *, 1249 otel_span: otel_trace_api.Span, 1250 langfuse_client: "Langfuse", 1251 input: Optional[Any] = None, 1252 output: Optional[Any] = None, 1253 metadata: Optional[Any] = None, 1254 environment: Optional[str] = None, 1255 version: Optional[str] = None, 1256 level: Optional[SpanLevel] = None, 1257 status_message: Optional[str] = None, 1258 ): 1259 """Initialize a new LangfuseSpan. 1260 1261 Args: 1262 otel_span: The OpenTelemetry span to wrap 1263 langfuse_client: Reference to the parent Langfuse client 1264 input: Input data for the span (any JSON-serializable object) 1265 output: Output data from the span (any JSON-serializable object) 1266 metadata: Additional metadata to associate with the span 1267 environment: The tracing environment 1268 version: Version identifier for the code or component 1269 level: Importance level of the span (info, warning, error) 1270 status_message: Optional status message for the span 1271 """ 1272 super().__init__( 1273 otel_span=otel_span, 1274 as_type="span", 1275 langfuse_client=langfuse_client, 1276 input=input, 1277 output=output, 1278 metadata=metadata, 1279 environment=environment, 1280 version=version, 1281 level=level, 1282 status_message=status_message, 1283 )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
1246 def __init__( 1247 self, 1248 *, 1249 otel_span: otel_trace_api.Span, 1250 langfuse_client: "Langfuse", 1251 input: Optional[Any] = None, 1252 output: Optional[Any] = None, 1253 metadata: Optional[Any] = None, 1254 environment: Optional[str] = None, 1255 version: Optional[str] = None, 1256 level: Optional[SpanLevel] = None, 1257 status_message: Optional[str] = None, 1258 ): 1259 """Initialize a new LangfuseSpan. 1260 1261 Args: 1262 otel_span: The OpenTelemetry span to wrap 1263 langfuse_client: Reference to the parent Langfuse client 1264 input: Input data for the span (any JSON-serializable object) 1265 output: Output data from the span (any JSON-serializable object) 1266 metadata: Additional metadata to associate with the span 1267 environment: The tracing environment 1268 version: Version identifier for the code or component 1269 level: Importance level of the span (info, warning, error) 1270 status_message: Optional status message for the span 1271 """ 1272 super().__init__( 1273 otel_span=otel_span, 1274 as_type="span", 1275 langfuse_client=langfuse_client, 1276 input=input, 1277 output=output, 1278 metadata=metadata, 1279 environment=environment, 1280 version=version, 1281 level=level, 1282 status_message=status_message, 1283 )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
1286class LangfuseGeneration(LangfuseObservationWrapper): 1287 """Specialized span implementation for AI model generations in Langfuse. 1288 1289 This class represents a generation span specifically designed for tracking 1290 AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized 1291 attributes for model details, token usage, and costs. 1292 """ 1293 1294 def __init__( 1295 self, 1296 *, 1297 otel_span: otel_trace_api.Span, 1298 langfuse_client: "Langfuse", 1299 input: Optional[Any] = None, 1300 output: Optional[Any] = None, 1301 metadata: Optional[Any] = None, 1302 environment: Optional[str] = None, 1303 version: Optional[str] = None, 1304 level: Optional[SpanLevel] = None, 1305 status_message: Optional[str] = None, 1306 completion_start_time: Optional[datetime] = None, 1307 model: Optional[str] = None, 1308 model_parameters: Optional[Dict[str, MapValue]] = None, 1309 usage_details: Optional[Dict[str, int]] = None, 1310 cost_details: Optional[Dict[str, float]] = None, 1311 prompt: Optional[PromptClient] = None, 1312 ): 1313 """Initialize a new LangfuseGeneration span. 1314 1315 Args: 1316 otel_span: The OpenTelemetry span to wrap 1317 langfuse_client: Reference to the parent Langfuse client 1318 input: Input data for the generation (e.g., prompts) 1319 output: Output from the generation (e.g., completions) 1320 metadata: Additional metadata to associate with the generation 1321 environment: The tracing environment 1322 version: Version identifier for the model or component 1323 level: Importance level of the generation (info, warning, error) 1324 status_message: Optional status message for the generation 1325 completion_start_time: When the model started generating the response 1326 model: Name/identifier of the AI model used (e.g., "gpt-4") 1327 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1328 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1329 cost_details: Cost information for the model call 1330 prompt: Associated prompt template from Langfuse prompt management 1331 """ 1332 super().__init__( 1333 as_type="generation", 1334 otel_span=otel_span, 1335 langfuse_client=langfuse_client, 1336 input=input, 1337 output=output, 1338 metadata=metadata, 1339 environment=environment, 1340 version=version, 1341 level=level, 1342 status_message=status_message, 1343 completion_start_time=completion_start_time, 1344 model=model, 1345 model_parameters=model_parameters, 1346 usage_details=usage_details, 1347 cost_details=cost_details, 1348 prompt=prompt, 1349 )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
1294 def __init__( 1295 self, 1296 *, 1297 otel_span: otel_trace_api.Span, 1298 langfuse_client: "Langfuse", 1299 input: Optional[Any] = None, 1300 output: Optional[Any] = None, 1301 metadata: Optional[Any] = None, 1302 environment: Optional[str] = None, 1303 version: Optional[str] = None, 1304 level: Optional[SpanLevel] = None, 1305 status_message: Optional[str] = None, 1306 completion_start_time: Optional[datetime] = None, 1307 model: Optional[str] = None, 1308 model_parameters: Optional[Dict[str, MapValue]] = None, 1309 usage_details: Optional[Dict[str, int]] = None, 1310 cost_details: Optional[Dict[str, float]] = None, 1311 prompt: Optional[PromptClient] = None, 1312 ): 1313 """Initialize a new LangfuseGeneration span. 1314 1315 Args: 1316 otel_span: The OpenTelemetry span to wrap 1317 langfuse_client: Reference to the parent Langfuse client 1318 input: Input data for the generation (e.g., prompts) 1319 output: Output from the generation (e.g., completions) 1320 metadata: Additional metadata to associate with the generation 1321 environment: The tracing environment 1322 version: Version identifier for the model or component 1323 level: Importance level of the generation (info, warning, error) 1324 status_message: Optional status message for the generation 1325 completion_start_time: When the model started generating the response 1326 model: Name/identifier of the AI model used (e.g., "gpt-4") 1327 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1328 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1329 cost_details: Cost information for the model call 1330 prompt: Associated prompt template from Langfuse prompt management 1331 """ 1332 super().__init__( 1333 as_type="generation", 1334 otel_span=otel_span, 1335 langfuse_client=langfuse_client, 1336 input=input, 1337 output=output, 1338 metadata=metadata, 1339 environment=environment, 1340 version=version, 1341 level=level, 1342 status_message=status_message, 1343 completion_start_time=completion_start_time, 1344 model=model, 1345 model_parameters=model_parameters, 1346 usage_details=usage_details, 1347 cost_details=cost_details, 1348 prompt=prompt, 1349 )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
1352class LangfuseEvent(LangfuseObservationWrapper): 1353 """Specialized span implementation for Langfuse Events.""" 1354 1355 def __init__( 1356 self, 1357 *, 1358 otel_span: otel_trace_api.Span, 1359 langfuse_client: "Langfuse", 1360 input: Optional[Any] = None, 1361 output: Optional[Any] = None, 1362 metadata: Optional[Any] = None, 1363 environment: Optional[str] = None, 1364 version: Optional[str] = None, 1365 level: Optional[SpanLevel] = None, 1366 status_message: Optional[str] = None, 1367 ): 1368 """Initialize a new LangfuseEvent span. 1369 1370 Args: 1371 otel_span: The OpenTelemetry span to wrap 1372 langfuse_client: Reference to the parent Langfuse client 1373 input: Input data for the event 1374 output: Output from the event 1375 metadata: Additional metadata to associate with the generation 1376 environment: The tracing environment 1377 version: Version identifier for the model or component 1378 level: Importance level of the generation (info, warning, error) 1379 status_message: Optional status message for the generation 1380 """ 1381 super().__init__( 1382 otel_span=otel_span, 1383 as_type="event", 1384 langfuse_client=langfuse_client, 1385 input=input, 1386 output=output, 1387 metadata=metadata, 1388 environment=environment, 1389 version=version, 1390 level=level, 1391 status_message=status_message, 1392 ) 1393 1394 def update( 1395 self, 1396 *, 1397 name: Optional[str] = None, 1398 input: Optional[Any] = None, 1399 output: Optional[Any] = None, 1400 metadata: Optional[Any] = None, 1401 version: Optional[str] = None, 1402 level: Optional[SpanLevel] = None, 1403 status_message: Optional[str] = None, 1404 completion_start_time: Optional[datetime] = None, 1405 model: Optional[str] = None, 1406 model_parameters: Optional[Dict[str, MapValue]] = None, 1407 usage_details: Optional[Dict[str, int]] = None, 1408 cost_details: Optional[Dict[str, float]] = None, 1409 prompt: Optional[PromptClient] = None, 1410 **kwargs: Any, 1411 ) -> "LangfuseEvent": 1412 """Update is not allowed for LangfuseEvent because events cannot be updated. 1413 1414 This method logs a warning and returns self without making changes. 1415 1416 Returns: 1417 self: Returns the unchanged LangfuseEvent instance 1418 """ 1419 langfuse_logger.warning( 1420 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1421 ) 1422 return self
Specialized span implementation for Langfuse Events.
1355 def __init__( 1356 self, 1357 *, 1358 otel_span: otel_trace_api.Span, 1359 langfuse_client: "Langfuse", 1360 input: Optional[Any] = None, 1361 output: Optional[Any] = None, 1362 metadata: Optional[Any] = None, 1363 environment: Optional[str] = None, 1364 version: Optional[str] = None, 1365 level: Optional[SpanLevel] = None, 1366 status_message: Optional[str] = None, 1367 ): 1368 """Initialize a new LangfuseEvent span. 1369 1370 Args: 1371 otel_span: The OpenTelemetry span to wrap 1372 langfuse_client: Reference to the parent Langfuse client 1373 input: Input data for the event 1374 output: Output from the event 1375 metadata: Additional metadata to associate with the generation 1376 environment: The tracing environment 1377 version: Version identifier for the model or component 1378 level: Importance level of the generation (info, warning, error) 1379 status_message: Optional status message for the generation 1380 """ 1381 super().__init__( 1382 otel_span=otel_span, 1383 as_type="event", 1384 langfuse_client=langfuse_client, 1385 input=input, 1386 output=output, 1387 metadata=metadata, 1388 environment=environment, 1389 version=version, 1390 level=level, 1391 status_message=status_message, 1392 )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
1394 def update( 1395 self, 1396 *, 1397 name: Optional[str] = None, 1398 input: Optional[Any] = None, 1399 output: Optional[Any] = None, 1400 metadata: Optional[Any] = None, 1401 version: Optional[str] = None, 1402 level: Optional[SpanLevel] = None, 1403 status_message: Optional[str] = None, 1404 completion_start_time: Optional[datetime] = None, 1405 model: Optional[str] = None, 1406 model_parameters: Optional[Dict[str, MapValue]] = None, 1407 usage_details: Optional[Dict[str, int]] = None, 1408 cost_details: Optional[Dict[str, float]] = None, 1409 prompt: Optional[PromptClient] = None, 1410 **kwargs: Any, 1411 ) -> "LangfuseEvent": 1412 """Update is not allowed for LangfuseEvent because events cannot be updated. 1413 1414 This method logs a warning and returns self without making changes. 1415 1416 Returns: 1417 self: Returns the unchanged LangfuseEvent instance 1418 """ 1419 langfuse_logger.warning( 1420 "Attempted to update LangfuseEvent observation. Events cannot be updated after creation." 1421 ) 1422 return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
28class LangfuseOtelSpanAttributes: 29 # Langfuse-Trace attributes 30 TRACE_NAME = "langfuse.trace.name" 31 TRACE_USER_ID = "user.id" 32 TRACE_SESSION_ID = "session.id" 33 TRACE_TAGS = "langfuse.trace.tags" 34 TRACE_PUBLIC = "langfuse.trace.public" 35 TRACE_METADATA = "langfuse.trace.metadata" 36 TRACE_INPUT = "langfuse.trace.input" 37 TRACE_OUTPUT = "langfuse.trace.output" 38 39 # Langfuse-observation attributes 40 OBSERVATION_TYPE = "langfuse.observation.type" 41 OBSERVATION_METADATA = "langfuse.observation.metadata" 42 OBSERVATION_LEVEL = "langfuse.observation.level" 43 OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message" 44 OBSERVATION_INPUT = "langfuse.observation.input" 45 OBSERVATION_OUTPUT = "langfuse.observation.output" 46 47 # Langfuse-observation of type Generation attributes 48 OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time" 49 OBSERVATION_MODEL = "langfuse.observation.model.name" 50 OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters" 51 OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details" 52 OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details" 53 OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name" 54 OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version" 55 56 # General 57 ENVIRONMENT = "langfuse.environment" 58 RELEASE = "langfuse.release" 59 VERSION = "langfuse.version" 60 61 # Internal 62 AS_ROOT = "langfuse.internal.as_root" 63 64 # Experiments 65 EXPERIMENT_ID = "langfuse.experiment.id" 66 EXPERIMENT_NAME = "langfuse.experiment.name" 67 EXPERIMENT_DESCRIPTION = "langfuse.experiment.description" 68 EXPERIMENT_METADATA = "langfuse.experiment.metadata" 69 EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id" 70 EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id" 71 EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output" 72 EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata" 73 EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
1425class LangfuseAgent(LangfuseObservationWrapper): 1426 """Agent observation for reasoning blocks that act on tools using LLM guidance.""" 1427 1428 def __init__(self, **kwargs: Any) -> None: 1429 """Initialize a new LangfuseAgent span.""" 1430 kwargs["as_type"] = "agent" 1431 super().__init__(**kwargs)
Agent observation for reasoning blocks that act on tools using LLM guidance.
1434class LangfuseTool(LangfuseObservationWrapper): 1435 """Tool observation representing external tool calls, e.g., calling a weather API.""" 1436 1437 def __init__(self, **kwargs: Any) -> None: 1438 """Initialize a new LangfuseTool span.""" 1439 kwargs["as_type"] = "tool" 1440 super().__init__(**kwargs)
Tool observation representing external tool calls, e.g., calling a weather API.
1443class LangfuseChain(LangfuseObservationWrapper): 1444 """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.""" 1445 1446 def __init__(self, **kwargs: Any) -> None: 1447 """Initialize a new LangfuseChain span.""" 1448 kwargs["as_type"] = "chain" 1449 super().__init__(**kwargs)
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
1461class LangfuseEmbedding(LangfuseObservationWrapper): 1462 """Embedding observation for LLM embedding calls, typically used before retrieval.""" 1463 1464 def __init__(self, **kwargs: Any) -> None: 1465 """Initialize a new LangfuseEmbedding span.""" 1466 kwargs["as_type"] = "embedding" 1467 super().__init__(**kwargs)
Embedding observation for LLM embedding calls, typically used before retrieval.
1470class LangfuseEvaluator(LangfuseObservationWrapper): 1471 """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.""" 1472 1473 def __init__(self, **kwargs: Any) -> None: 1474 """Initialize a new LangfuseEvaluator span.""" 1475 kwargs["as_type"] = "evaluator" 1476 super().__init__(**kwargs)
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
1452class LangfuseRetriever(LangfuseObservationWrapper): 1453 """Retriever observation for data retrieval steps, e.g. vector store or database queries.""" 1454 1455 def __init__(self, **kwargs: Any) -> None: 1456 """Initialize a new LangfuseRetriever span.""" 1457 kwargs["as_type"] = "retriever" 1458 super().__init__(**kwargs)
Retriever observation for data retrieval steps, e.g. vector store or database queries.
1479class LangfuseGuardrail(LangfuseObservationWrapper): 1480 """Guardrail observation for protection e.g. against jailbreaks or offensive content.""" 1481 1482 def __init__(self, **kwargs: Any) -> None: 1483 """Initialize a new LangfuseGuardrail span.""" 1484 kwargs["as_type"] = "guardrail" 1485 super().__init__(**kwargs)
Guardrail observation for protection e.g. against jailbreaks or offensive content.
93class Evaluation: 94 """Represents an evaluation result for an experiment item or an entire experiment run. 95 96 This class provides a strongly-typed way to create evaluation results in evaluator functions. 97 Users must use keyword arguments when instantiating this class. 98 99 Attributes: 100 name: Unique identifier for the evaluation metric. Should be descriptive 101 and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). 102 Used for aggregation and comparison across experiment runs. 103 value: The evaluation score or result. Can be: 104 - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42) 105 - String: For categorical results like "positive", "negative", "neutral" 106 - Boolean: For binary assessments like "passes_safety_check" 107 comment: Optional human-readable explanation of the evaluation result. 108 Useful for providing context, explaining scoring rationale, or noting 109 special conditions. Displayed in Langfuse UI for interpretability. 110 metadata: Optional structured metadata about the evaluation process. 111 Can include confidence scores, intermediate calculations, model versions, 112 or any other relevant technical details. 113 data_type: Optional score data type. Required if value is not NUMERIC. 114 One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC. 115 config_id: Optional Langfuse score config ID. 116 117 Examples: 118 Basic accuracy evaluation: 119 ```python 120 from langfuse import Evaluation 121 122 def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): 123 if not expected_output: 124 return Evaluation(name="accuracy", value=0, comment="No expected output") 125 126 is_correct = output.strip().lower() == expected_output.strip().lower() 127 return Evaluation( 128 name="accuracy", 129 value=1.0 if is_correct else 0.0, 130 comment="Correct answer" if is_correct else "Incorrect answer" 131 ) 132 ``` 133 134 Multi-metric evaluator: 135 ```python 136 def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): 137 return [ 138 Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), 139 Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), 140 Evaluation( 141 name="quality", 142 value=0.85, 143 comment="High quality response", 144 metadata={"confidence": 0.92, "model": "gpt-4"} 145 ) 146 ] 147 ``` 148 149 Categorical evaluation: 150 ```python 151 def sentiment_evaluator(*, input, output, **kwargs): 152 sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" 153 return Evaluation( 154 name="sentiment", 155 value=sentiment, 156 comment=f"Response expresses {sentiment} sentiment", 157 data_type="CATEGORICAL" 158 ) 159 ``` 160 161 Failed evaluation with error handling: 162 ```python 163 def external_api_evaluator(*, input, output, **kwargs): 164 try: 165 score = external_api.evaluate(output) 166 return Evaluation(name="external_score", value=score) 167 except Exception as e: 168 return Evaluation( 169 name="external_score", 170 value=0, 171 comment=f"API unavailable: {e}", 172 metadata={"error": str(e), "retry_count": 3} 173 ) 174 ``` 175 176 Note: 177 All arguments must be passed as keywords. Positional arguments are not allowed 178 to ensure code clarity and prevent errors from argument reordering. 179 """ 180 181 def __init__( 182 self, 183 *, 184 name: str, 185 value: Union[int, float, str, bool], 186 comment: Optional[str] = None, 187 metadata: Optional[Dict[str, Any]] = None, 188 data_type: Optional[ScoreDataType] = None, 189 config_id: Optional[str] = None, 190 ): 191 """Initialize an Evaluation with the provided data. 192 193 Args: 194 name: Unique identifier for the evaluation metric. 195 value: The evaluation score or result. 196 comment: Optional human-readable explanation of the result. 197 metadata: Optional structured metadata about the evaluation process. 198 data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). 199 config_id: Optional Langfuse score config ID. 200 201 Note: 202 All arguments must be provided as keywords. Positional arguments will raise a TypeError. 203 """ 204 self.name = name 205 self.value = value 206 self.comment = comment 207 self.metadata = metadata 208 self.data_type = data_type 209 self.config_id = config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
from langfuse import Evaluation def accuracy_evaluator(*, input, output, expected_output=None, **kwargs): if not expected_output: return Evaluation(name="accuracy", value=0, comment="No expected output") is_correct = output.strip().lower() == expected_output.strip().lower() return Evaluation( name="accuracy", value=1.0 if is_correct else 0.0, comment="Correct answer" if is_correct else "Incorrect answer" )Multi-metric evaluator:
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs): return [ Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"), Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"), Evaluation( name="quality", value=0.85, comment="High quality response", metadata={"confidence": 0.92, "model": "gpt-4"} ) ]Categorical evaluation:
def sentiment_evaluator(*, input, output, **kwargs): sentiment = analyze_sentiment(output) # Returns "positive", "negative", or "neutral" return Evaluation( name="sentiment", value=sentiment, comment=f"Response expresses {sentiment} sentiment", data_type="CATEGORICAL" )Failed evaluation with error handling:
def external_api_evaluator(*, input, output, **kwargs): try: score = external_api.evaluate(output) return Evaluation(name="external_score", value=score) except Exception as e: return Evaluation( name="external_score", value=0, comment=f"API unavailable: {e}", metadata={"error": str(e), "retry_count": 3} )
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
181 def __init__( 182 self, 183 *, 184 name: str, 185 value: Union[int, float, str, bool], 186 comment: Optional[str] = None, 187 metadata: Optional[Dict[str, Any]] = None, 188 data_type: Optional[ScoreDataType] = None, 189 config_id: Optional[str] = None, 190 ): 191 """Initialize an Evaluation with the provided data. 192 193 Args: 194 name: Unique identifier for the evaluation metric. 195 value: The evaluation score or result. 196 comment: Optional human-readable explanation of the result. 197 metadata: Optional structured metadata about the evaluation process. 198 data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN). 199 config_id: Optional Langfuse score config ID. 200 201 Note: 202 All arguments must be provided as keywords. Positional arguments will raise a TypeError. 203 """ 204 self.name = name 205 self.value = value 206 self.comment = comment 207 self.metadata = metadata 208 self.data_type = data_type 209 self.config_id = config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
40class EvaluatorInputs: 41 """Input data structure for evaluators, returned by mapper functions. 42 43 This class provides a strongly-typed container for transforming API response 44 objects (traces, observations) into the standardized format expected 45 by evaluator functions. It ensures consistent access to input, output, expected 46 output, and metadata regardless of the source entity type. 47 48 Attributes: 49 input: The input data that was provided to generate the output being evaluated. 50 For traces, this might be the initial prompt or request. For observations, 51 this could be the span's input. The exact meaning depends on your use case. 52 output: The actual output that was produced and needs to be evaluated. 53 For traces, this is typically the final response. For observations, 54 this might be the generation output or span result. 55 expected_output: Optional ground truth or expected result for comparison. 56 Used by evaluators to assess correctness. May be None if no ground truth 57 is available for the entity being evaluated. 58 metadata: Optional structured metadata providing additional context for evaluation. 59 Can include information about the entity, execution context, user attributes, 60 or any other relevant data that evaluators might use. 61 62 Examples: 63 Simple mapper for traces: 64 ```python 65 from langfuse import EvaluatorInputs 66 67 def trace_mapper(trace): 68 return EvaluatorInputs( 69 input=trace.input, 70 output=trace.output, 71 expected_output=None, # No ground truth available 72 metadata={"user_id": trace.user_id, "tags": trace.tags} 73 ) 74 ``` 75 76 Mapper for observations extracting specific fields: 77 ```python 78 def observation_mapper(observation): 79 # Extract input/output from observation's data 80 input_data = observation.input if hasattr(observation, 'input') else None 81 output_data = observation.output if hasattr(observation, 'output') else None 82 83 return EvaluatorInputs( 84 input=input_data, 85 output=output_data, 86 expected_output=None, 87 metadata={ 88 "observation_type": observation.type, 89 "model": observation.model, 90 "latency_ms": observation.end_time - observation.start_time 91 } 92 ) 93 ``` 94 ``` 95 96 Note: 97 All arguments must be passed as keywords when instantiating this class. 98 """ 99 100 def __init__( 101 self, 102 *, 103 input: Any, 104 output: Any, 105 expected_output: Any = None, 106 metadata: Optional[Dict[str, Any]] = None, 107 ): 108 """Initialize EvaluatorInputs with the provided data. 109 110 Args: 111 input: The input data for evaluation. 112 output: The output data to be evaluated. 113 expected_output: Optional ground truth for comparison. 114 metadata: Optional additional context for evaluation. 115 116 Note: 117 All arguments must be provided as keywords. 118 """ 119 self.input = input 120 self.output = output 121 self.expected_output = expected_output 122 self.metadata = metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
from langfuse import EvaluatorInputs def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, # No ground truth available metadata={"user_id": trace.user_id, "tags": trace.tags} )Mapper for observations extracting specific fields:
def observation_mapper(observation): # Extract input/output from observation's data input_data = observation.input if hasattr(observation, 'input') else None output_data = observation.output if hasattr(observation, 'output') else None return EvaluatorInputs( input=input_data, output=output_data, expected_output=None, metadata={ "observation_type": observation.type, "model": observation.model, "latency_ms": observation.end_time - observation.start_time } )```
Note:
All arguments must be passed as keywords when instantiating this class.
100 def __init__( 101 self, 102 *, 103 input: Any, 104 output: Any, 105 expected_output: Any = None, 106 metadata: Optional[Dict[str, Any]] = None, 107 ): 108 """Initialize EvaluatorInputs with the provided data. 109 110 Args: 111 input: The input data for evaluation. 112 output: The output data to be evaluated. 113 expected_output: Optional ground truth for comparison. 114 metadata: Optional additional context for evaluation. 115 116 Note: 117 All arguments must be provided as keywords. 118 """ 119 self.input = input 120 self.output = output 121 self.expected_output = expected_output 122 self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
125class MapperFunction(Protocol): 126 """Protocol defining the interface for mapper functions in batch evaluation. 127 128 Mapper functions transform API response objects (traces or observations) 129 into the standardized EvaluatorInputs format that evaluators expect. This abstraction 130 allows you to define how to extract and structure evaluation data from different 131 entity types. 132 133 Mapper functions must: 134 - Accept a single item parameter (trace, observation) 135 - Return an EvaluatorInputs instance with input, output, expected_output, metadata 136 - Can be either synchronous or asynchronous 137 - Should handle missing or malformed data gracefully 138 """ 139 140 def __call__( 141 self, 142 *, 143 item: Union["TraceWithFullDetails", "ObservationsView"], 144 **kwargs: Dict[str, Any], 145 ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]: 146 """Transform an API response object into evaluator inputs. 147 148 This method defines how to extract evaluation-relevant data from the raw 149 API response object. The implementation should map entity-specific fields 150 to the standardized input/output/expected_output/metadata structure. 151 152 Args: 153 item: The API response object to transform. The type depends on the scope: 154 - TraceWithFullDetails: When evaluating traces 155 - ObservationsView: When evaluating observations 156 157 Returns: 158 EvaluatorInputs: A structured container with: 159 - input: The input data that generated the output 160 - output: The output to be evaluated 161 - expected_output: Optional ground truth for comparison 162 - metadata: Optional additional context 163 164 Can return either a direct EvaluatorInputs instance or an awaitable 165 (for async mappers that need to fetch additional data). 166 167 Examples: 168 Basic trace mapper: 169 ```python 170 def map_trace(trace): 171 return EvaluatorInputs( 172 input=trace.input, 173 output=trace.output, 174 expected_output=None, 175 metadata={"trace_id": trace.id, "user": trace.user_id} 176 ) 177 ``` 178 179 Observation mapper with conditional logic: 180 ```python 181 def map_observation(observation): 182 # Extract fields based on observation type 183 if observation.type == "GENERATION": 184 input_data = observation.input 185 output_data = observation.output 186 else: 187 # For other types, use different fields 188 input_data = observation.metadata.get("input") 189 output_data = observation.metadata.get("output") 190 191 return EvaluatorInputs( 192 input=input_data, 193 output=output_data, 194 expected_output=None, 195 metadata={"obs_id": observation.id, "type": observation.type} 196 ) 197 ``` 198 199 Async mapper (if additional processing needed): 200 ```python 201 async def map_trace_async(trace): 202 # Could do async processing here if needed 203 processed_output = await some_async_transformation(trace.output) 204 205 return EvaluatorInputs( 206 input=trace.input, 207 output=processed_output, 208 expected_output=None, 209 metadata={"trace_id": trace.id} 210 ) 211 ``` 212 """ 213 ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions must:
- Accept a single item parameter (trace, observation)
- Return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
1927def _no_init_or_replace_init(self, *args, **kwargs): 1928 cls = type(self) 1929 1930 if cls._is_protocol: 1931 raise TypeError('Protocols cannot be instantiated') 1932 1933 # Already using a custom `__init__`. No need to calculate correct 1934 # `__init__` to call. This can lead to RecursionError. See bpo-45121. 1935 if cls.__init__ is not _no_init_or_replace_init: 1936 return 1937 1938 # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`. 1939 # The first instantiation of the subclass will call `_no_init_or_replace_init` which 1940 # searches for a proper new `__init__` in the MRO. The new `__init__` 1941 # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent 1942 # instantiation of the protocol subclass will thus use the new 1943 # `__init__` and no longer call `_no_init_or_replace_init`. 1944 for base in cls.__mro__: 1945 init = base.__dict__.get('__init__', _no_init_or_replace_init) 1946 if init is not _no_init_or_replace_init: 1947 cls.__init__ = init 1948 break 1949 else: 1950 # should not happen 1951 cls.__init__ = object.__init__ 1952 1953 cls.__init__(self, *args, **kwargs)
216class CompositeEvaluatorFunction(Protocol): 217 """Protocol defining the interface for composite evaluator functions. 218 219 Composite evaluators create aggregate scores from multiple item-level evaluations. 220 This is commonly used to compute weighted averages, combined metrics, or other 221 composite assessments based on individual evaluation results. 222 223 Composite evaluators: 224 - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) 225 plus the list of evaluations 226 - Return either a single Evaluation, a list of Evaluations, or a dict 227 - Can be either synchronous or asynchronous 228 - Have access to both raw item data and evaluation results 229 """ 230 231 def __call__( 232 self, 233 *, 234 input: Optional[Any] = None, 235 output: Optional[Any] = None, 236 expected_output: Optional[Any] = None, 237 metadata: Optional[Dict[str, Any]] = None, 238 evaluations: List[Evaluation], 239 **kwargs: Dict[str, Any], 240 ) -> Union[ 241 Evaluation, 242 List[Evaluation], 243 Dict[str, Any], 244 Awaitable[Evaluation], 245 Awaitable[List[Evaluation]], 246 Awaitable[Dict[str, Any]], 247 ]: 248 r"""Create a composite evaluation from item-level evaluation results. 249 250 This method combines multiple evaluation scores into a single composite metric. 251 Common use cases include weighted averages, pass/fail decisions based on multiple 252 criteria, or custom scoring logic that considers multiple dimensions. 253 254 Args: 255 input: The input data that was provided to the system being evaluated. 256 output: The output generated by the system being evaluated. 257 expected_output: The expected/reference output for comparison (if available). 258 metadata: Additional metadata about the evaluation context. 259 evaluations: List of evaluation results from item-level evaluators. 260 Each evaluation contains name, value, comment, and metadata. 261 262 Returns: 263 Can return any of: 264 - Evaluation: A single composite evaluation result 265 - List[Evaluation]: Multiple composite evaluations 266 - Dict: A dict that will be converted to an Evaluation 267 - name: Identifier for the composite metric (e.g., "composite_score") 268 - value: The computed composite value 269 - comment: Optional explanation of how the score was computed 270 - metadata: Optional details about the composition logic 271 272 Can return either a direct Evaluation instance or an awaitable 273 (for async composite evaluators). 274 275 Examples: 276 Simple weighted average: 277 ```python 278 def weighted_composite(*, input, output, expected_output, metadata, evaluations): 279 weights = { 280 "accuracy": 0.5, 281 "relevance": 0.3, 282 "safety": 0.2 283 } 284 285 total_score = 0.0 286 total_weight = 0.0 287 288 for eval in evaluations: 289 if eval.name in weights and isinstance(eval.value, (int, float)): 290 total_score += eval.value * weights[eval.name] 291 total_weight += weights[eval.name] 292 293 final_score = total_score / total_weight if total_weight > 0 else 0.0 294 295 return Evaluation( 296 name="composite_score", 297 value=final_score, 298 comment=f"Weighted average of {len(evaluations)} metrics" 299 ) 300 ``` 301 302 Pass/fail composite based on thresholds: 303 ```python 304 def pass_fail_composite(*, input, output, expected_output, metadata, evaluations): 305 # Must pass all criteria 306 thresholds = { 307 "accuracy": 0.7, 308 "safety": 0.9, 309 "relevance": 0.6 310 } 311 312 passes = True 313 failing_metrics = [] 314 315 for metric, threshold in thresholds.items(): 316 eval_result = next((e for e in evaluations if e.name == metric), None) 317 if eval_result and isinstance(eval_result.value, (int, float)): 318 if eval_result.value < threshold: 319 passes = False 320 failing_metrics.append(metric) 321 322 return Evaluation( 323 name="passes_all_checks", 324 value=passes, 325 comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed", 326 data_type="BOOLEAN" 327 ) 328 ``` 329 330 Async composite with external scoring: 331 ```python 332 async def llm_composite(*, input, output, expected_output, metadata, evaluations): 333 # Use LLM to synthesize multiple evaluation results 334 eval_summary = "\n".join( 335 f"- {e.name}: {e.value}" for e in evaluations 336 ) 337 338 prompt = f"Given these evaluation scores:\n{eval_summary}\n" 339 prompt += f"For the output: {output}\n" 340 prompt += "Provide an overall quality score from 0-1." 341 342 response = await openai.chat.completions.create( 343 model="gpt-4", 344 messages=[{"role": "user", "content": prompt}] 345 ) 346 347 score = float(response.choices[0].message.content.strip()) 348 349 return Evaluation( 350 name="llm_composite_score", 351 value=score, 352 comment="LLM-synthesized composite score" 353 ) 354 ``` 355 356 Context-aware composite: 357 ```python 358 def context_composite(*, input, output, expected_output, metadata, evaluations): 359 # Adjust weighting based on metadata 360 base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2} 361 362 # If metadata indicates high importance, prioritize accuracy 363 if metadata and metadata.get('importance') == 'high': 364 weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1} 365 else: 366 weights = base_weights 367 368 total = sum( 369 e.value * weights.get(e.name, 0) 370 for e in evaluations 371 if isinstance(e.value, (int, float)) 372 ) 373 374 return Evaluation( 375 name="weighted_composite", 376 value=total, 377 comment="Context-aware weighted composite" 378 ) 379 ``` 380 """ 381 ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
1927def _no_init_or_replace_init(self, *args, **kwargs): 1928 cls = type(self) 1929 1930 if cls._is_protocol: 1931 raise TypeError('Protocols cannot be instantiated') 1932 1933 # Already using a custom `__init__`. No need to calculate correct 1934 # `__init__` to call. This can lead to RecursionError. See bpo-45121. 1935 if cls.__init__ is not _no_init_or_replace_init: 1936 return 1937 1938 # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`. 1939 # The first instantiation of the subclass will call `_no_init_or_replace_init` which 1940 # searches for a proper new `__init__` in the MRO. The new `__init__` 1941 # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent 1942 # instantiation of the protocol subclass will thus use the new 1943 # `__init__` and no longer call `_no_init_or_replace_init`. 1944 for base in cls.__mro__: 1945 init = base.__dict__.get('__init__', _no_init_or_replace_init) 1946 if init is not _no_init_or_replace_init: 1947 cls.__init__ = init 1948 break 1949 else: 1950 # should not happen 1951 cls.__init__ = object.__init__ 1952 1953 cls.__init__(self, *args, **kwargs)
384class EvaluatorStats: 385 """Statistics for a single evaluator's performance during batch evaluation. 386 387 This class tracks detailed metrics about how a specific evaluator performed 388 across all items in a batch evaluation run. It helps identify evaluator issues, 389 understand reliability, and optimize evaluation pipelines. 390 391 Attributes: 392 name: The name of the evaluator function (extracted from __name__). 393 total_runs: Total number of times the evaluator was invoked. 394 successful_runs: Number of times the evaluator completed successfully. 395 failed_runs: Number of times the evaluator raised an exception or failed. 396 total_scores_created: Total number of evaluation scores created by this evaluator. 397 Can be higher than successful_runs if the evaluator returns multiple scores. 398 399 Examples: 400 Accessing evaluator stats from batch evaluation result: 401 ```python 402 result = client.run_batched_evaluation(...) 403 404 for stats in result.evaluator_stats: 405 print(f"Evaluator: {stats.name}") 406 print(f" Success rate: {stats.successful_runs / stats.total_runs:.1%}") 407 print(f" Scores created: {stats.total_scores_created}") 408 409 if stats.failed_runs > 0: 410 print(f" â ī¸ Failed {stats.failed_runs} times") 411 ``` 412 413 Identifying problematic evaluators: 414 ```python 415 result = client.run_batched_evaluation(...) 416 417 # Find evaluators with high failure rates 418 for stats in result.evaluator_stats: 419 failure_rate = stats.failed_runs / stats.total_runs 420 if failure_rate > 0.1: # More than 10% failures 421 print(f"â ī¸ {stats.name} has {failure_rate:.1%} failure rate") 422 print(f" Consider debugging or removing this evaluator") 423 ``` 424 425 Note: 426 All arguments must be passed as keywords when instantiating this class. 427 """ 428 429 def __init__( 430 self, 431 *, 432 name: str, 433 total_runs: int = 0, 434 successful_runs: int = 0, 435 failed_runs: int = 0, 436 total_scores_created: int = 0, 437 ): 438 """Initialize EvaluatorStats with the provided metrics. 439 440 Args: 441 name: The evaluator function name. 442 total_runs: Total number of evaluator invocations. 443 successful_runs: Number of successful completions. 444 failed_runs: Number of failures. 445 total_scores_created: Total scores created by this evaluator. 446 447 Note: 448 All arguments must be provided as keywords. 449 """ 450 self.name = name 451 self.total_runs = total_runs 452 self.successful_runs = successful_runs 453 self.failed_runs = failed_runs 454 self.total_scores_created = total_scores_created
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: print(f"Evaluator: {stats.name}") print(f" Success rate: {stats.successful_runs / stats.total_runs:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" â ī¸ Failed {stats.failed_runs} times")Identifying problematic evaluators:
result = client.run_batched_evaluation(...) # Find evaluators with high failure rates for stats in result.evaluator_stats: failure_rate = stats.failed_runs / stats.total_runs if failure_rate > 0.1: # More than 10% failures print(f"â ī¸ {stats.name} has {failure_rate:.1%} failure rate") print(f" Consider debugging or removing this evaluator")
Note:
All arguments must be passed as keywords when instantiating this class.
429 def __init__( 430 self, 431 *, 432 name: str, 433 total_runs: int = 0, 434 successful_runs: int = 0, 435 failed_runs: int = 0, 436 total_scores_created: int = 0, 437 ): 438 """Initialize EvaluatorStats with the provided metrics. 439 440 Args: 441 name: The evaluator function name. 442 total_runs: Total number of evaluator invocations. 443 successful_runs: Number of successful completions. 444 failed_runs: Number of failures. 445 total_scores_created: Total scores created by this evaluator. 446 447 Note: 448 All arguments must be provided as keywords. 449 """ 450 self.name = name 451 self.total_runs = total_runs 452 self.successful_runs = successful_runs 453 self.failed_runs = failed_runs 454 self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
457class BatchEvaluationResumeToken: 458 """Token for resuming a failed batch evaluation run. 459 460 This class encapsulates all the information needed to resume a batch evaluation 461 that was interrupted or failed partway through. It uses timestamp-based filtering 462 to avoid re-processing items that were already evaluated, even if the underlying 463 dataset changed between runs. 464 465 Attributes: 466 scope: The type of items being evaluated ("traces", "observations"). 467 filter: The original JSON filter string used to query items. 468 last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. 469 Used to construct a filter that only fetches items after this timestamp. 470 last_processed_id: The ID of the last successfully processed item, for reference. 471 items_processed: Count of items successfully processed before interruption. 472 473 Examples: 474 Resuming a failed batch evaluation: 475 ```python 476 # Initial run that fails partway through 477 try: 478 result = client.run_batched_evaluation( 479 scope="traces", 480 mapper=my_mapper, 481 evaluators=[evaluator1, evaluator2], 482 filter='{"tags": ["production"]}', 483 max_items=10000 484 ) 485 except Exception as e: 486 print(f"Evaluation failed: {e}") 487 488 # Save the resume token 489 if result.resume_token: 490 # Store resume token for later (e.g., in a file or database) 491 import json 492 with open("resume_token.json", "w") as f: 493 json.dump({ 494 "scope": result.resume_token.scope, 495 "filter": result.resume_token.filter, 496 "last_timestamp": result.resume_token.last_processed_timestamp, 497 "last_id": result.resume_token.last_processed_id, 498 "items_done": result.resume_token.items_processed 499 }, f) 500 501 # Later, resume from where it left off 502 with open("resume_token.json") as f: 503 token_data = json.load(f) 504 505 resume_token = BatchEvaluationResumeToken( 506 scope=token_data["scope"], 507 filter=token_data["filter"], 508 last_processed_timestamp=token_data["last_timestamp"], 509 last_processed_id=token_data["last_id"], 510 items_processed=token_data["items_done"] 511 ) 512 513 # Resume the evaluation 514 result = client.run_batched_evaluation( 515 scope="traces", 516 mapper=my_mapper, 517 evaluators=[evaluator1, evaluator2], 518 resume_from=resume_token 519 ) 520 521 print(f"Processed {result.total_items_processed} additional items") 522 ``` 523 524 Handling partial completion: 525 ```python 526 result = client.run_batched_evaluation(...) 527 528 if not result.completed: 529 print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items") 530 print(f"Last item: {result.resume_token.last_processed_id}") 531 print(f"Resume from: {result.resume_token.last_processed_timestamp}") 532 533 # Optionally retry automatically 534 if result.resume_token: 535 print("Retrying...") 536 result = client.run_batched_evaluation( 537 scope=result.resume_token.scope, 538 mapper=my_mapper, 539 evaluators=my_evaluators, 540 resume_from=result.resume_token 541 ) 542 ``` 543 544 Note: 545 All arguments must be passed as keywords when instantiating this class. 546 The timestamp-based approach means that items created after the initial run 547 but before the timestamp will be skipped. This is intentional to avoid 548 duplicates and ensure consistent evaluation. 549 """ 550 551 def __init__( 552 self, 553 *, 554 scope: str, 555 filter: Optional[str], 556 last_processed_timestamp: str, 557 last_processed_id: str, 558 items_processed: int, 559 ): 560 """Initialize BatchEvaluationResumeToken with the provided state. 561 562 Args: 563 scope: The scope type ("traces", "observations"). 564 filter: The original JSON filter string. 565 last_processed_timestamp: ISO 8601 timestamp of last processed item. 566 last_processed_id: ID of last processed item. 567 items_processed: Count of items processed before interruption. 568 569 Note: 570 All arguments must be provided as keywords. 571 """ 572 self.scope = scope 573 self.filter = filter 574 self.last_processed_timestamp = last_processed_timestamp 575 self.last_processed_id = last_processed_id 576 self.items_processed = items_processed
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
# Initial run that fails partway through try: result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], filter='{"tags": ["production"]}', max_items=10000 ) except Exception as e: print(f"Evaluation failed: {e}") # Save the resume token if result.resume_token: # Store resume token for later (e.g., in a file or database) import json with open("resume_token.json", "w") as f: json.dump({ "scope": result.resume_token.scope, "filter": result.resume_token.filter, "last_timestamp": result.resume_token.last_processed_timestamp, "last_id": result.resume_token.last_processed_id, "items_done": result.resume_token.items_processed }, f) # Later, resume from where it left off with open("resume_token.json") as f: token_data = json.load(f) resume_token = BatchEvaluationResumeToken( scope=token_data["scope"], filter=token_data["filter"], last_processed_timestamp=token_data["last_timestamp"], last_processed_id=token_data["last_id"], items_processed=token_data["items_done"] ) # Resume the evaluation result = client.run_batched_evaluation( scope="traces", mapper=my_mapper, evaluators=[evaluator1, evaluator2], resume_from=resume_token ) print(f"Processed {result.total_items_processed} additional items")Handling partial completion:
result = client.run_batched_evaluation(...) if not result.completed: print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items") print(f"Last item: {result.resume_token.last_processed_id}") print(f"Resume from: {result.resume_token.last_processed_timestamp}") # Optionally retry automatically if result.resume_token: print("Retrying...") result = client.run_batched_evaluation( scope=result.resume_token.scope, mapper=my_mapper, evaluators=my_evaluators, resume_from=result.resume_token )
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
551 def __init__( 552 self, 553 *, 554 scope: str, 555 filter: Optional[str], 556 last_processed_timestamp: str, 557 last_processed_id: str, 558 items_processed: int, 559 ): 560 """Initialize BatchEvaluationResumeToken with the provided state. 561 562 Args: 563 scope: The scope type ("traces", "observations"). 564 filter: The original JSON filter string. 565 last_processed_timestamp: ISO 8601 timestamp of last processed item. 566 last_processed_id: ID of last processed item. 567 items_processed: Count of items processed before interruption. 568 569 Note: 570 All arguments must be provided as keywords. 571 """ 572 self.scope = scope 573 self.filter = filter 574 self.last_processed_timestamp = last_processed_timestamp 575 self.last_processed_id = last_processed_id 576 self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
579class BatchEvaluationResult: 580 r"""Complete result structure for batch evaluation execution. 581 582 This class encapsulates comprehensive statistics and metadata about a batch 583 evaluation run, including counts, evaluator-specific metrics, timing information, 584 error details, and resume capability. 585 586 Attributes: 587 total_items_fetched: Total number of items fetched from the API. 588 total_items_processed: Number of items successfully evaluated. 589 total_items_failed: Number of items that failed during evaluation. 590 total_scores_created: Total scores created by all item-level evaluators. 591 total_composite_scores_created: Scores created by the composite evaluator. 592 total_evaluations_failed: Number of individual evaluator failures across all items. 593 evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created). 594 resume_token: Token for resuming if evaluation was interrupted (None if completed). 595 completed: True if all items were processed, False if stopped early or failed. 596 duration_seconds: Total time taken to execute the batch evaluation. 597 failed_item_ids: List of IDs for items that failed evaluation. 598 error_summary: Dictionary mapping error types to occurrence counts. 599 has_more_items: True if max_items limit was reached but more items exist. 600 item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite). 601 602 Examples: 603 Basic result inspection: 604 ```python 605 result = client.run_batched_evaluation(...) 606 607 print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") 608 print(f"Scores created: {result.total_scores_created}") 609 print(f"Duration: {result.duration_seconds:.2f}s") 610 print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}") 611 ``` 612 613 Detailed analysis with evaluator stats: 614 ```python 615 result = client.run_batched_evaluation(...) 616 617 print(f"\nđ Batch Evaluation Results") 618 print(f"{'='*50}") 619 print(f"Items processed: {result.total_items_processed}") 620 print(f"Items failed: {result.total_items_failed}") 621 print(f"Scores created: {result.total_scores_created}") 622 623 if result.total_composite_scores_created > 0: 624 print(f"Composite scores: {result.total_composite_scores_created}") 625 626 print(f"\nđ Evaluator Performance:") 627 for stats in result.evaluator_stats: 628 success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 629 print(f"\n {stats.name}:") 630 print(f" Success rate: {success_rate:.1%}") 631 print(f" Scores created: {stats.total_scores_created}") 632 if stats.failed_runs > 0: 633 print(f" â ī¸ Failures: {stats.failed_runs}") 634 635 if result.error_summary: 636 print(f"\nâ ī¸ Errors encountered:") 637 for error_type, count in result.error_summary.items(): 638 print(f" {error_type}: {count}") 639 ``` 640 641 Handling incomplete runs: 642 ```python 643 result = client.run_batched_evaluation(...) 644 645 if not result.completed: 646 print("â ī¸ Evaluation incomplete!") 647 648 if result.resume_token: 649 print(f"Processed {result.resume_token.items_processed} items before failure") 650 print(f"Use resume_from parameter to continue from:") 651 print(f" Timestamp: {result.resume_token.last_processed_timestamp}") 652 print(f" Last ID: {result.resume_token.last_processed_id}") 653 654 if result.has_more_items: 655 print(f"âšī¸ More items available beyond max_items limit") 656 ``` 657 658 Performance monitoring: 659 ```python 660 result = client.run_batched_evaluation(...) 661 662 items_per_second = result.total_items_processed / result.duration_seconds 663 avg_scores_per_item = result.total_scores_created / result.total_items_processed 664 665 print(f"Performance metrics:") 666 print(f" Throughput: {items_per_second:.2f} items/second") 667 print(f" Avg scores/item: {avg_scores_per_item:.2f}") 668 print(f" Total duration: {result.duration_seconds:.2f}s") 669 670 if result.total_evaluations_failed > 0: 671 failure_rate = result.total_evaluations_failed / ( 672 result.total_items_processed * len(result.evaluator_stats) 673 ) 674 print(f" Evaluation failure rate: {failure_rate:.1%}") 675 ``` 676 677 Note: 678 All arguments must be passed as keywords when instantiating this class. 679 """ 680 681 def __init__( 682 self, 683 *, 684 total_items_fetched: int, 685 total_items_processed: int, 686 total_items_failed: int, 687 total_scores_created: int, 688 total_composite_scores_created: int, 689 total_evaluations_failed: int, 690 evaluator_stats: List[EvaluatorStats], 691 resume_token: Optional[BatchEvaluationResumeToken], 692 completed: bool, 693 duration_seconds: float, 694 failed_item_ids: List[str], 695 error_summary: Dict[str, int], 696 has_more_items: bool, 697 item_evaluations: Dict[str, List["Evaluation"]], 698 ): 699 """Initialize BatchEvaluationResult with comprehensive statistics. 700 701 Args: 702 total_items_fetched: Total items fetched from API. 703 total_items_processed: Items successfully evaluated. 704 total_items_failed: Items that failed evaluation. 705 total_scores_created: Scores from item-level evaluators. 706 total_composite_scores_created: Scores from composite evaluator. 707 total_evaluations_failed: Individual evaluator failures. 708 evaluator_stats: Per-evaluator statistics. 709 resume_token: Token for resuming (None if completed). 710 completed: Whether all items were processed. 711 duration_seconds: Total execution time. 712 failed_item_ids: IDs of failed items. 713 error_summary: Error types and counts. 714 has_more_items: Whether more items exist beyond max_items. 715 item_evaluations: Dictionary mapping item IDs to their evaluation results. 716 717 Note: 718 All arguments must be provided as keywords. 719 """ 720 self.total_items_fetched = total_items_fetched 721 self.total_items_processed = total_items_processed 722 self.total_items_failed = total_items_failed 723 self.total_scores_created = total_scores_created 724 self.total_composite_scores_created = total_composite_scores_created 725 self.total_evaluations_failed = total_evaluations_failed 726 self.evaluator_stats = evaluator_stats 727 self.resume_token = resume_token 728 self.completed = completed 729 self.duration_seconds = duration_seconds 730 self.failed_item_ids = failed_item_ids 731 self.error_summary = error_summary 732 self.has_more_items = has_more_items 733 self.item_evaluations = item_evaluations 734 735 def __str__(self) -> str: 736 """Return a formatted string representation of the batch evaluation results. 737 738 Returns: 739 A multi-line string with a summary of the evaluation results. 740 """ 741 lines = [] 742 lines.append("=" * 60) 743 lines.append("Batch Evaluation Results") 744 lines.append("=" * 60) 745 746 # Summary statistics 747 lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}") 748 lines.append(f"Duration: {self.duration_seconds:.2f}s") 749 lines.append(f"\nItems fetched: {self.total_items_fetched}") 750 lines.append(f"Items processed: {self.total_items_processed}") 751 752 if self.total_items_failed > 0: 753 lines.append(f"Items failed: {self.total_items_failed}") 754 755 # Success rate 756 if self.total_items_fetched > 0: 757 success_rate = self.total_items_processed / self.total_items_fetched * 100 758 lines.append(f"Success rate: {success_rate:.1f}%") 759 760 # Scores created 761 lines.append(f"\nScores created: {self.total_scores_created}") 762 if self.total_composite_scores_created > 0: 763 lines.append(f"Composite scores: {self.total_composite_scores_created}") 764 765 total_scores = self.total_scores_created + self.total_composite_scores_created 766 lines.append(f"Total scores: {total_scores}") 767 768 # Evaluator statistics 769 if self.evaluator_stats: 770 lines.append("\nEvaluator Performance:") 771 for stats in self.evaluator_stats: 772 lines.append(f" {stats.name}:") 773 if stats.total_runs > 0: 774 success_rate = ( 775 stats.successful_runs / stats.total_runs * 100 776 if stats.total_runs > 0 777 else 0 778 ) 779 lines.append( 780 f" Runs: {stats.successful_runs}/{stats.total_runs} " 781 f"({success_rate:.1f}% success)" 782 ) 783 lines.append(f" Scores created: {stats.total_scores_created}") 784 if stats.failed_runs > 0: 785 lines.append(f" Failed runs: {stats.failed_runs}") 786 787 # Performance metrics 788 if self.total_items_processed > 0 and self.duration_seconds > 0: 789 items_per_sec = self.total_items_processed / self.duration_seconds 790 lines.append("\nPerformance:") 791 lines.append(f" Throughput: {items_per_sec:.2f} items/second") 792 if self.total_scores_created > 0: 793 avg_scores = self.total_scores_created / self.total_items_processed 794 lines.append(f" Avg scores per item: {avg_scores:.2f}") 795 796 # Errors and warnings 797 if self.error_summary: 798 lines.append("\nErrors encountered:") 799 for error_type, count in self.error_summary.items(): 800 lines.append(f" {error_type}: {count}") 801 802 # Incomplete run information 803 if not self.completed: 804 lines.append("\nWarning: Evaluation incomplete") 805 if self.resume_token: 806 lines.append( 807 f" Last processed: {self.resume_token.last_processed_timestamp}" 808 ) 809 lines.append(f" Items processed: {self.resume_token.items_processed}") 810 lines.append(" Use resume_from parameter to continue") 811 812 if self.has_more_items: 813 lines.append("\nNote: More items available beyond max_items limit") 814 815 lines.append("=" * 60) 816 return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
result = client.run_batched_evaluation(...) print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}") print(f"Scores created: {result.total_scores_created}") print(f"Duration: {result.duration_seconds:.2f}s") print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")Detailed analysis with evaluator stats:
result = client.run_batched_evaluation(...) print(f"\nđ Batch Evaluation Results") print(f"{'='*50}") print(f"Items processed: {result.total_items_processed}") print(f"Items failed: {result.total_items_failed}") print(f"Scores created: {result.total_scores_created}") if result.total_composite_scores_created > 0: print(f"Composite scores: {result.total_composite_scores_created}") print(f"\nđ Evaluator Performance:") for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0 print(f"\n {stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" â ī¸ Failures: {stats.failed_runs}") if result.error_summary: print(f"\nâ ī¸ Errors encountered:") for error_type, count in result.error_summary.items(): print(f" {error_type}: {count}")Handling incomplete runs:
result = client.run_batched_evaluation(...) if not result.completed: print("â ī¸ Evaluation incomplete!") if result.resume_token: print(f"Processed {result.resume_token.items_processed} items before failure") print(f"Use resume_from parameter to continue from:") print(f" Timestamp: {result.resume_token.last_processed_timestamp}") print(f" Last ID: {result.resume_token.last_processed_id}") if result.has_more_items: print(f"âšī¸ More items available beyond max_items limit")Performance monitoring:
result = client.run_batched_evaluation(...) items_per_second = result.total_items_processed / result.duration_seconds avg_scores_per_item = result.total_scores_created / result.total_items_processed print(f"Performance metrics:") print(f" Throughput: {items_per_second:.2f} items/second") print(f" Avg scores/item: {avg_scores_per_item:.2f}") print(f" Total duration: {result.duration_seconds:.2f}s") if result.total_evaluations_failed > 0: failure_rate = result.total_evaluations_failed / ( result.total_items_processed * len(result.evaluator_stats) ) print(f" Evaluation failure rate: {failure_rate:.1%}")
Note:
All arguments must be passed as keywords when instantiating this class.
681 def __init__( 682 self, 683 *, 684 total_items_fetched: int, 685 total_items_processed: int, 686 total_items_failed: int, 687 total_scores_created: int, 688 total_composite_scores_created: int, 689 total_evaluations_failed: int, 690 evaluator_stats: List[EvaluatorStats], 691 resume_token: Optional[BatchEvaluationResumeToken], 692 completed: bool, 693 duration_seconds: float, 694 failed_item_ids: List[str], 695 error_summary: Dict[str, int], 696 has_more_items: bool, 697 item_evaluations: Dict[str, List["Evaluation"]], 698 ): 699 """Initialize BatchEvaluationResult with comprehensive statistics. 700 701 Args: 702 total_items_fetched: Total items fetched from API. 703 total_items_processed: Items successfully evaluated. 704 total_items_failed: Items that failed evaluation. 705 total_scores_created: Scores from item-level evaluators. 706 total_composite_scores_created: Scores from composite evaluator. 707 total_evaluations_failed: Individual evaluator failures. 708 evaluator_stats: Per-evaluator statistics. 709 resume_token: Token for resuming (None if completed). 710 completed: Whether all items were processed. 711 duration_seconds: Total execution time. 712 failed_item_ids: IDs of failed items. 713 error_summary: Error types and counts. 714 has_more_items: Whether more items exist beyond max_items. 715 item_evaluations: Dictionary mapping item IDs to their evaluation results. 716 717 Note: 718 All arguments must be provided as keywords. 719 """ 720 self.total_items_fetched = total_items_fetched 721 self.total_items_processed = total_items_processed 722 self.total_items_failed = total_items_failed 723 self.total_scores_created = total_scores_created 724 self.total_composite_scores_created = total_composite_scores_created 725 self.total_evaluations_failed = total_evaluations_failed 726 self.evaluator_stats = evaluator_stats 727 self.resume_token = resume_token 728 self.completed = completed 729 self.duration_seconds = duration_seconds 730 self.failed_item_ids = failed_item_ids 731 self.error_summary = error_summary 732 self.has_more_items = has_more_items 733 self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
73def is_default_export_span(span: ReadableSpan) -> bool: 74 """Return whether a span should be exported by default.""" 75 return ( 76 is_langfuse_span(span) or is_genai_span(span) or is_known_llm_instrumentor(span) 77 )
Return whether a span should be exported by default.
36def is_langfuse_span(span: ReadableSpan) -> bool: 37 """Return whether the span was created by the Langfuse SDK tracer.""" 38 return ( 39 span.instrumentation_scope is not None 40 and span.instrumentation_scope.name == LANGFUSE_TRACER_NAME 41 )
Return whether the span was created by the Langfuse SDK tracer.
44def is_genai_span(span: ReadableSpan) -> bool: 45 """Return whether the span has any ``gen_ai.*`` semantic convention attribute.""" 46 if span.attributes is None: 47 return False 48 49 return any( 50 isinstance(key, str) and key.startswith("gen_ai") 51 for key in span.attributes.keys() 52 )
Return whether the span has any gen_ai.* semantic convention attribute.
60def is_known_llm_instrumentor(span: ReadableSpan) -> bool: 61 """Return whether the span comes from a known LLM instrumentation scope.""" 62 if span.instrumentation_scope is None: 63 return False 64 65 scope_name = span.instrumentation_scope.name 66 67 return any( 68 _matches_scope_prefix(scope_name, prefix) 69 for prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES 70 )
Return whether the span comes from a known LLM instrumentation scope.