AI agents often struggle to use the tools at their disposal effectively. At VrealSoft, we’ve developed a set of approaches to improve tool usage, starting with well-structured tool definitions:
```python
# Example of a well-structured tool definition
tools = [
    {
        "name": "search_database",
        "description": "Search the customer database for matching records",
        "parameters": {
            "query": {
                "type": "string",
                "description": "The search term to look for"
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 5
            }
        },
        "returns": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": "string",
                    "name": "string",
                    "details": "object"
                }
            }
        },
        "examples": [
            {
                "input": {"query": "customer_id:12345", "limit": 1},
                "output": [{"id": "12345", "name": "John Smith", "details": {...}}]
            }
        ]
    }
]
```

The key techniques we rely on include:

Few-Shot Examples
Providing clear examples of when and how to use each tool
Tool Taxonomies
Organizing tools into categories to aid in selection
Decision Trees
Explicit guidance on which situations warrant specific tools (a brief sketch of taxonomies and decision guidance follows this list)
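As a rough illustration of tool taxonomies and decision guidance, the sketch below groups tools into categories and attaches simple routing hints. The category names, the tool names other than `search_database`, and the `suggest_tools` helper are hypothetical, not part of our production system.

```python
# Hypothetical illustration: a small tool taxonomy with decision hints.
# Category names, tool names other than search_database, and the routing
# keywords are made up for this sketch.
TOOL_TAXONOMY = {
    "data_retrieval": {
        "tools": ["search_database", "fetch_document"],
        "use_when": "The answer requires looking up stored records or documents.",
    },
    "computation": {
        "tools": ["run_calculation"],
        "use_when": "The answer requires arithmetic or numeric aggregation.",
    },
    "communication": {
        "tools": ["send_email"],
        "use_when": "The user explicitly asks to notify or contact someone.",
    },
}

# Keyword hints standing in for a fuller decision tree; a production system
# would usually put the "use_when" guidance into the prompt instead.
ROUTING_KEYWORDS = {
    "data_retrieval": ("find", "look up", "search", "record"),
    "computation": ("calculate", "total", "average", "sum"),
    "communication": ("email", "notify", "contact"),
}


def suggest_tools(task_description: str) -> list[str]:
    """Return candidate tools whose category keywords appear in the task text."""
    lowered = task_description.lower()
    suggestions = []
    for category, words in ROUTING_KEYWORDS.items():
        if any(word in lowered for word in words):
            suggestions.extend(TOOL_TAXONOMY[category]["tools"])
    return suggestions


print(suggest_tools("Find the record for customer 12345 and calculate their total spend"))
# ['search_database', 'fetch_document', 'run_calculation']
```

The point is not the keyword matching but the explicit category structure: the same taxonomy and "use_when" guidance can be rendered into the system prompt so the agent makes the routing decision itself.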
Agents also often struggle to format tool parameters correctly. Our approach combines validation, automatic repair, and guided correction:
```python
def validate_and_repair_parameters(tool_call):
    tool_name = tool_call["name"]
    params = tool_call["parameters"]

    # Get tool definition
    tool_def = get_tool_definition(tool_name)

    # Validate required parameters
    for param_name, param_def in tool_def["parameters"].items():
        if param_def.get("required", False) and param_name not in params:
            return {
                "valid": False,
                "error": f"Missing required parameter: {param_name}",
                "suggestion": f"Add the {param_name} parameter, which should be a {param_def['type']}"
            }

    # Type checking and repairs
    for param_name, param_value in params.items():
        if param_name in tool_def["parameters"]:
            param_def = tool_def["parameters"][param_name]

            # Type validation
            if not validate_type(param_value, param_def["type"]):
                # Try to repair (None means the repair failed)
                repaired_value = attempt_repair(param_value, param_def["type"])
                if repaired_value is not None:
                    params[param_name] = repaired_value
                else:
                    return {
                        "valid": False,
                        "error": f"Parameter {param_name} has incorrect type",
                        "suggestion": f"Expected {param_def['type']}, got {type(param_value).__name__}"
                    }

    return {"valid": True, "parameters": params}


async def execute_tool_with_correction(agent, tool_call):
    # Validate parameters
    validation = validate_and_repair_parameters(tool_call)

    if validation["valid"]:
        # Execute the tool if parameters are valid
        return await execute_tool(tool_call["name"], validation["parameters"])
    else:
        # Guide the agent to fix the issues
        correction_prompt = f"""
        Your tool call had the following issue: {validation['error']}

        Suggestion: {validation['suggestion']}

        Please try again with fixed parameters.
        """

        # Get corrected tool call from agent
        corrected_call = await agent.generate_response(correction_prompt)

        # Extract and validate the new tool call
        new_tool_call = extract_tool_call(corrected_call)
        return await execute_tool_with_correction(agent, new_tool_call)
```

Agents also need to make good use of the results that tools return. Our approach includes:
```python
from datetime import datetime


# Example of a structured tool result
def format_tool_result(raw_result, tool_name):
    # Create a structured result with metadata
    formatted_result = {
        "tool_name": tool_name,
        "timestamp": datetime.now().isoformat(),
        "status": "success" if raw_result else "no_results",
        "result_type": determine_result_type(raw_result),
        "result_summary": summarize_result(raw_result),
        "result": raw_result,
        "usage_guidance": get_usage_guidance(tool_name, raw_result)
    }

    return formatted_result
```

Tool Chains
Predefined sequences of tools for common tasks (a combined sketch of these three patterns follows this list)
Reflective Execution
Having agents evaluate the effectiveness of tool use
Iterative Refinement
Progressively improving tool calls based on results
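To make these three patterns concrete, here is a minimal, self-contained sketch of a predefined tool chain with a reflection step and a single retry. The chain definition, the `fake_tool` stub, the `search_orders` tool name, and the `reflect` rule are hypothetical illustrations rather than our production interfaces.

```python
# Hypothetical sketch: a predefined tool chain with reflection and refinement.
# Tool names (other than search_database), chain steps, and the reflection
# rule are illustrative only.
import asyncio

# A "tool chain" is an ordered list of tool calls; the runner stores each
# step's result under the step name so later steps (or the agent) can use it.
CUSTOMER_LOOKUP_CHAIN = [
    {"step": "find_customer", "tool": "search_database", "limit": 1},
    {"step": "fetch_orders", "tool": "search_orders", "limit": 10},
]


async def fake_tool(tool_name, limit):
    # Stand-in for real tool execution
    return {"tool": tool_name, "limit": limit, "data": f"results from {tool_name}"}


def reflect(step_name, result):
    # Reflective execution: a crude check on whether the result looks usable.
    # A real agent would reason about the result; here we just test for data.
    return "ok" if result.get("data") else "retry"


async def run_chain(chain):
    results = {}
    for step in chain:
        result = await fake_tool(step["tool"], step["limit"])
        # Iterative refinement: retry once with a wider query if reflection
        # flags the result as unusable
        if reflect(step["step"], result) == "retry":
            result = await fake_tool(step["tool"], step["limit"] * 2)
        results[step["step"]] = result
    return results


print(asyncio.run(run_chain(CUSTOMER_LOOKUP_CHAIN)))
```

In a real system the reflection step would be another model call that inspects the structured result (including its `result_summary` and `usage_guidance` fields) before the chain continues.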
We’ve developed a comprehensive training and evaluation system for tool use:
```python
tool_use_training = {
    "methods": [
        {
            "name": "demonstration_learning",
            "description": "Providing examples of correct tool usage",
            "examples_per_tool": 5
        },
        {
            "name": "error_correction",
            "description": "Showing common mistakes and their fixes",
            "examples_per_error_type": 3
        },
        {
            "name": "tool_exploration",
            "description": "Guided discovery of tool capabilities",
            "exploration_tasks_per_tool": 2
        }
    ],
    "evaluation": {
        "metrics": [
            "selection_accuracy",     # Using the right tool
            "parameter_correctness",  # Formatting parameters properly
            "result_utilization",     # Using the results effectively
            "efficiency"              # Minimizing unnecessary tool calls
        ],
        "test_scenarios": 100,        # Number of evaluation scenarios
        "human_verification": True    # Human review of complex tool use
    }
}


def evaluate_tool_usage(agent, test_cases):
    results = {
        "tool_selection": {
            "correct": 0,
            "incorrect": 0,
            "missed_opportunities": 0
        },
        "parameter_formulation": {
            "correct": 0,
            "fixable_errors": 0,
            "critical_errors": 0
        },
        "result_interpretation": {
            "fully_utilized": 0,
            "partially_utilized": 0,
            "misinterpreted": 0
        },
        "efficiency": {
            "optimal_calls": 0,
            "acceptable_calls": 0,
            "excessive_calls": 0
        }
    }

    for case in test_cases:
        # Run the agent on the test case
        trace = run_agent_with_tracing(agent, case.input)

        # Evaluate the agent's performance
        case_results = evaluate_trace(trace, case.expected)

        # Update overall results
        update_results(results, case_results)

    return calculate_metrics(results)
```

We track tool usage patterns and failures to continuously improve our agents:
```python
# Tool usage monitoring system
class ToolUsageMonitor:
    def __init__(self):
        self.usage_stats = {}
        self.failure_patterns = {}
        self.improvement_suggestions = []

    def record_tool_call(self, agent_id, tool_call, result, success):
        # Record basic usage statistics
        tool_name = tool_call["name"]
        if tool_name not in self.usage_stats:
            self.usage_stats[tool_name] = {
                "calls": 0,
                "successes": 0,
                "failures": 0,
                "params": {}
            }

        self.usage_stats[tool_name]["calls"] += 1
        if success:
            self.usage_stats[tool_name]["successes"] += 1
        else:
            self.usage_stats[tool_name]["failures"] += 1
            self.record_failure_pattern(tool_name, tool_call, result)

    def record_failure_pattern(self, tool_name, tool_call, result):
        # Analyze and categorize failure patterns
        failure_type = categorize_failure(tool_call, result)

        if failure_type not in self.failure_patterns:
            self.failure_patterns[failure_type] = {
                "count": 0,
                "examples": []
            }

        self.failure_patterns[failure_type]["count"] += 1
        if len(self.failure_patterns[failure_type]["examples"]) < 5:
            self.failure_patterns[failure_type]["examples"].append({
                "tool_call": tool_call,
                "result": result
            })

        # Generate improvement suggestions
        if self.failure_patterns[failure_type]["count"] >= 10:
            suggestion = generate_improvement_suggestion(
                failure_type,
                self.failure_patterns[failure_type]["examples"]
            )
            self.improvement_suggestions.append(suggestion)

    def get_insights(self):
        return {
            "usage_stats": self.usage_stats,
            "common_failures": sorted(
                self.failure_patterns.items(),
                key=lambda x: x[1]["count"],
                reverse=True
            ),
            "improvement_suggestions": self.improvement_suggestions
        }
```

We continue to explore: