# run_qwen_mysql_agent_auto.py
import os
import json
import re
import logging
import argparse
from qwen_agent.agents import Assistant
from qwen_agent.utils.output_beautify import typewriter_print
import subprocess
from datetime import datetime
from dotenv import load_dotenv
import random
load_dotenv()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
parser = argparse.ArgumentParser(description="Run Qwen MySQL Agent to generate and verify QA items.")
parser.add_argument('--iterations', type=int, default=10, help='Number of generation loops to run.')
args = parser.parse_args()
DIFFICULTY = "Easy"
# DIFFICULTY = "Medium"
# DIFFICULTY = "Hard"
GENERATED_QA_FILE = "generated_qa.jsonl"
qa_history = []
next_qa_id = 1
previous_questions = []
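# Resume support: reload previously generated QA items so that IDs keep increasing
# across runs and earlier questions can be fed back into the prompt to discourage
# repetition.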
if os.path.exists(GENERATED_QA_FILE):
    logging.info(f"Loading previous QA items from {GENERATED_QA_FILE}...")
    with open(GENERATED_QA_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                qa_history.append(item)
                # The 'final_question' is nested inside 'qa_item'
                if 'qa_item' in item and 'final_question' in item['qa_item']:
                    previous_questions.append(item['qa_item']['final_question'])
            except json.JSONDecodeError:
                logging.warning(f"Could not parse line in {GENERATED_QA_FILE}: {line.strip()}")
    if qa_history:
        # Find the max id and set the next id
        max_id = max(item.get('id', 0) for item in qa_history)
        next_qa_id = max_id + 1
    PREVIOUS_GENERATED_TASKS = "\n".join(previous_questions)
    logging.info(f"Loaded {len(qa_history)} previous QA items. Next ID is {next_qa_id}.")
    if PREVIOUS_GENERATED_TASKS:
        logging.info(f"Providing {len(previous_questions)} previous questions for context.")
api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_BASE_URL')
# 1. Choose the LLM (the example uses the DashScope cloud endpoint; replace it with your own)
llm_cfg = {
    # 'model': 'qwen3-8b',          # cannot do tool calls
    'model': 'qwen3-235b-a22b',     # ! this one works
    # 'model': 'qwen3-30b-a3b',     # cannot do tool calls
    # 'model': 'qwen-plus-latest',  # no thinking mode
    # 'model': 'qwen-turbo-latest', # no thinking mode
    # Use the endpoint provided by Alibaba Model Studio:
    # 'model_type': 'qwen_dashscope',
    # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
    # Use a custom endpoint compatible with OpenAI API:
    'model_server': base_url,
    'api_key': api_key,
    # Other parameters:
    # 'generate_cfg': {
    #     # Add: When the response content is `<think>this is the thought</think>this is the answer;
    #     # Do not add: When the response has been separated by reasoning_content and content.
    #     'thought_in_content': True,
    # },
}
# 2. Describe the available tools: mount the MySQL MCP server that was just started
# https://www.modelscope.cn/mcp/servers/@designcomputer/mysql_mcp_server
tools = [{
    "mcpServers": {
        "mysql": {
            "command": "uv",
            "args": [
                "--directory",
                "/home/ubuntu/.mcp",
                "run",
                "mysql_mcp_server"
            ],
            "env": {
                "MYSQL_HOST": "localhost",
                "MYSQL_PORT": "23306",
                "MYSQL_USER": "mcpuser",
                "MYSQL_PASSWORD": "StrongPass123!",
                "MYSQL_DATABASE": "magentodb"
            }
        }
    }
}]
# 3. Create the agent
bot = Assistant(
    llm=llm_cfg,
    function_list=tools,
)
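# Note: the Assistant launches the MCP server above as a stdio subprocess (via
# `uv run mysql_mcp_server`) and exposes its SQL tool to the model. The exact tool
# name is defined by the server package (assumed here to be an "execute SQL"-style
# call); the host, port, and credentials are deployment-specific values.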
prompt14_template = """
You are an expert Magento 2 database analyst, Python programmer, and an autonomous agent. **You ARE equipped with a tool (e.g., 'mysql_mcp_server_tool_call') to interact directly with a MySQL database. You MUST use this tool for all database queries to simulate data gathering. Your answers and derived values will be based SOLELY on the data retrieved via this tool from the live database.** Your current task is to generate ONE comprehensive Question-Answer (QA) item related to the Magento 2 database.
**1. Core Principles: Feasibility, Difficulty, and Diversity**
**A. Principle 1: Web Agent Feasibility**
**Crucially, every question you generate MUST be answerable by a web agent interacting with a standard Magento admin panel.** The agent does not have direct database access. Its capabilities are limited to what a human user can do through a web browser:
* **Searching:** Using search bars on product, order, or customer pages.
* **Filtering:** Applying filters to grids (e.g., filter orders by status 'Processing').
* **Sorting:** Sorting columns in a grid (e.g., sort customers by 'Lifetime Sales' to find the top one).
* **Navigation & Reading:** Clicking into a specific item's detail/edit page and reading a value.
* **Counting from Grids:** Reading the total count of items after applying a filter (e.g., "Showing 1-20 of **45** items").
**Avoid questions that require complex, database-only operations.**
* **BAD (Not Web-Feasible):** `"What is the average number of items per order?"` - No single page in the admin panel calculates and displays this value.
* **GOOD (Web-Feasible):** `"How many orders currently have the status 'pending'?"` - An agent can navigate to the Sales > Orders grid, apply a filter, and read the total count.
**A.1. CRITICAL RULE: Rephrasing Yes/No Questions into Information Extraction Tasks**
Questions that can be answered with a simple "Yes" or "No" are **STRICTLY FORBIDDEN**. They encourage guessing and do not effectively test the agent's ability to extract specific information. You **MUST** reframe any binary check into a question that retrieves a state or a value.
* **INSTEAD OF (FORBIDDEN):** `"Is the product with SKU 'MSH03' enabled?"`
* **DO THIS (REQUIRED):** `"What is the enable status for the product with SKU 'MSH03'?"`
* *(Expected `llm_derived_answer`: "Enabled" or "Disabled")*
* **INSTEAD OF (FORBIDDEN):** `"Is the product 'Strive Shoulder Pack' in stock?"`
* **DO THIS (REQUIRED):** `"What is the stock status for the product 'Strive Shoulder Pack'?"`
* *(Expected `llm_derived_answer`: "In Stock" or "Out of Stock")*
* **INSTEAD OF (FORBIDDEN):** `"Is the product with SKU 'MSH03-36-Blue' currently enabled and in stock?"`
* **DO THIS (REQUIRED):** `"What are the enable status and stock status for the product with SKU 'MSH03-36-Blue'?"`
* *(Expected `llm_derived_answer`: "The product is Enabled and In Stock.")*
**Consequence for Validation:** As a result of this rule, the literal strings "Yes" and "No" are considered **invalid** values for the `expected_value` field within your `validation_rules`. You must validate against the actual state word (e.g., "Enabled", "In Stock", "Complete").
**B. Principle 2: Task Difficulty Levels**
You must generate a task that aligns with the specified `{DIFFICULTY}` level. The difficulty is determined by the complexity of the workflow the web agent must execute.
* **### Easy**
* **Definition:** Tasks that can be completed in a **single step or on a single page** with a simple action.
* **Typical Actions:** Applying a single filter to a grid, performing a direct search for a known item, or reading a clearly visible value on a main page.
* **Web-Feasible Example:** "How many orders currently have the status 'pending'?"
* **Agent Workflow:** Navigate to the Sales > Orders grid, apply one filter ("Status: Pending"), and read the total record count displayed on the page.
* **### Medium**
* **Definition:** Tasks that require a **sequence of 2-4 distinct, linear steps**. This often involves navigating from a list/grid view to a detail/edit view.
* **Typical Actions:** Searching for an item then opening it, applying multiple filters, or finding an item in one grid and using that info to look something up on its detail page.
* **Web-Feasible Example:** "What is the shipping address for the order with the increment ID '000000123'?"
* **Agent Workflow:** 1. Navigate to the Sales > Orders grid. 2. Search for '000000123'. 3. Click on the order to open its detail page. 4. Locate and read the shipping address block.
* **### Hard**
* **Definition:** Tasks that require **complex logic, data comparison/synthesis across different pages, or looping through items**. The agent cannot rely on a simple, linear sequence of clicks.
* **Typical Actions:** Finding information on one page to use as a filter on another, comparing values across multiple items manually, or tasks where the UI doesn't natively support the required sorting/filtering combination.
* **Web-Feasible Example:** "What is the name of the most expensive product within the 'Tops' category?"
* **Agent Workflow (is complex):** 1. Navigate to the Products grid. 2. Filter by "Category: Tops". 3. The grid likely cannot be sorted by final price directly in this view. The agent would need to iterate through the filtered product list, potentially clicking into *each product's page* to find its price, store it, and compare it against the others to find the maximum. This looping and comparison makes it hard.
**C. Principle 3: Dynamic Entity Selection for True Diversity**
**To prevent generating repetitive questions using the same few examples, you MUST dynamically source the key entity for your question from the live database.** This is a mandatory first step in your process.
* **Problem to Avoid:** Using SKUs, order numbers, or customer emails directly from the illustrative `TABLE_SAMPLES_CONTENT`.
* **Required Action:** Before formulating your question, you **MUST** perform an initial, exploratory query to fetch a list of random, valid identifiers from the database.
* **Example Exploratory Queries (using `ORDER BY RAND()`):**
* To get random product SKUs: `SELECT sku FROM catalog_product_entity ORDER BY RAND() LIMIT 10;`
* To get random order increment IDs: `SELECT increment_id FROM sales_order ORDER BY RAND() LIMIT 10;`
* To get random customer emails: `SELECT email FROM customer_entity ORDER BY RAND() LIMIT 10;`
* **Rule:** After fetching this list via your tool, you **MUST** select ONE entity that you have NEVER used before from the returned results to use as the subject of your `final_question`.
**D. Principle 4: Avoiding Repetition of Previous Tasks**
**You will be provided with a list of previously generated questions. Your primary goal is to create a new task that is fundamentally different in its core logic and agent workflow.**
* **List of Previous Tasks for Reference:**
--- START OF PREVIOUSLY GENERATED TASKS ---
{PREVIOUS_GENERATED_TASKS}
--- END OF PREVIOUSLY GENERATED TASKS ---
* **Definition of Repetition (to be AVOIDED):**
* Simply changing the entity (e.g., asking for the stock of SKU 'B' instead of SKU 'A').
* Asking for the same information about a different entity type (e.g., asking for a customer's creation date instead of an order's creation date).
* Minor variations in filtering (e.g., asking for 'processing' orders instead of 'pending' orders).
* **Required Action:**
1. **Analyze the PREVIOUS_GENERATED_TASKS list.** Identify the core agent workflows and question patterns that have already been used (e.g., "find item by X and read property Y", "filter grid by Z and count results").
2. **Innovate a new task.** Your new question must introduce a new combination of actions, a different sequence of steps, or query a different aspect of the system that has not been explored in the previous tasks.
3. **Self-Correction:** If your initial idea feels too similar to a previous task, you MUST discard it and formulate a new, more distinct one. Narrate this in your thought process: "My initial idea to ask for a product's price is too similar to the previous task about a product's stock. I will instead create a task about finding all products within a certain price range."
**E. Principle 5: Adherence to Valid Data Timeframe**
**All questions involving dates or time periods MUST be scoped within the years 2022 and 2023.** This is the known valid data range for the database.
* **Problem to Avoid:** Using relative timeframes that could fall outside the valid data range.
* **FORBIDDEN Examples:** `"How many new customers were created in the last 7 days?"`, `"List all orders from the last year."`, `"What was the total revenue in May 2024?"`
* **REQUIRED Examples:** `"What was the total revenue generated in the month of May 2023?"`, `"How many new customer accounts were created between January 1, 2022, and March 31, 2022?"`, `"List all products that were updated during the fourth quarter of 2022."`
You **MUST** ensure any date-based question uses specific, absolute dates or date ranges that fall squarely within 2022 or 2023.
**2. Contextual & Inspirational Information**
* **Database Schema:**
--- START OF DATABASE SCHEMA ---
{MAGENTO_SCHEMA_CONTENT}
--- END OF DATABASE SCHEMA ---
* **Key Magento Schema Characteristics & EAV Model (Essential for query design):**
* **EAV (Entity-Attribute-Value):** Many entities (products, categories, customers) use EAV. Your SQL will need to join `eav_attribute` (to find `attribute_id` from `attribute_code`) with the correct value table (e.g., `catalog_product_entity_varchar`, `_int`).
* **Store Scopes:** Data can be global (store_id=0 or admin), website-specific, or store-view-specific. Queries for attributes often need to specify `store_id = 0` for admin/default values.
* **Order Workflow:** `quote` (cart) -> `sales_order` -> `sales_invoice`, `sales_shipment`, `sales_creditmemo`.
* **Flat/Grid Tables:** Tables like `sales_order_grid` and `customer_grid_flat` are excellent indicators of what data is likely available in an admin grid for a web agent to see, filter, and sort.
* **Question Diversity Inspiration (Themes for Web-Feasible Tasks)**
* **A. Ranking & Sorting:** "Which customer has the highest Lifetime Sales value?"
* **B. Aggregation via Filtered Count:** "What is the total number of orders with the status 'complete'?"
* **C. Temporal / Date-Based Filtering:** "How many new customer accounts were created in October 2023?"
* **D. Conditional Filtering & Property Checks:** "Find all 'simple' products that are currently out of stock."
* **E. Existence & Specific Lookups:** "What is the enable status of the product with SKU '[Dynamically Selected SKU]'?"
* **F. EAV Attribute Lookups:** "What is the customer's Group for the user with email '[Dynamically Selected Email]'?"
**3. Your Task: Generate ONE QA Item of `{DIFFICULTY}` difficulty**
Follow these phases meticulously:
**Phase A: Question Formulation & Iterative Data Collection (USING YOUR DATABASE TOOL)**
1. **Analyze Previous Tasks & Innovate (MANDATORY FIRST STEP):**
* Review the `{PREVIOUS_GENERATED_TASKS}` list to understand existing task patterns.
* **State your analysis:** "I have reviewed the previous tasks. I see patterns like [describe a pattern]. To avoid repetition, I will create a new task that involves [describe the novel workflow/logic]."
2. **Dynamic Entity Selection (MANDATORY SECOND STEP):**
* If your novel question idea requires a specific entity, perform an exploratory query to fetch a list of random, valid identifiers. You **MUST** use a method like `ORDER BY RAND() LIMIT 10` for this.
* **State your plan:** "For my novel question, I need a random entity. I will execute using my tool: `[Your exploratory SQL query]`".
* **Process Tool Results:** "My tool returned: `[...]`. I will select '[Chosen Entity]'."
3. **Formulate an `initial_question` (string):**
* **CRITICAL:** Now, using the entity you selected in the previous step, formulate a question.
* The question's complexity MUST match the specified `{DIFFICULTY}` level. Use the definitions and examples in Section 1B for guidance.
* The question must be **strictly feasible for a web agent**.
* Choose a theme from the "Question Diversity Inspiration" section.
* **Special Instructions for Ranking Questions (MECE Principle MUST be followed):** If you choose the "Ranking & Sorting" theme, particularly for "most/highest/top" questions, you **MUST** follow these additional rigorous rules to ensure the answer is **Mutually Exclusive, Collectively Exhaustive (MECE)**.
* **Problem to Solve:** A simple `ORDER BY ... LIMIT 1` query is UNRELIABLE and FORBIDDEN as the final logic, as it can miss items that are tied for the top rank.
* **Mandatory Iterative Verification Process:**
1. **Initial Probe Query:** First, execute an exploratory query with a moderate limit (e.g., `... ORDER BY value DESC LIMIT 10`).
2. **Analyze and Verify:**
* **If the results are NOT all tied:** You can be confident in the top result(s).
* **If ALL results from the initial probe are tied:** You **MUST** assume the answer is incomplete. This indicates a potential tie boundary issue. You **MUST** then execute a second, more robust query to find the complete set of all tied items. This can be done in two ways:
* (Method A - Iterative) Re-run the query with a much larger limit (e.g., `LIMIT 100`) to find where the tie breaks.
* (Method B - Definitive, **Strongly Preferred**) Execute a window function query to programmatically isolate *all* items in the top rank, for example: `SELECT ... FROM (SELECT ..., DENSE_RANK() OVER (ORDER BY ranking_column DESC) as dr FROM ...) ranked_items WHERE dr = 1;`.
3. **Self-Correction:** If your second query reveals more tied items than the first, you **MUST** update your understanding and base your final answer on this complete, verified set of data. Your thought process must narrate this: "My initial query with `LIMIT 10` showed all items tied at $99. This is inconclusive. I will now run a `DENSE_RANK()` query to find all items with rank 1 to ensure a MECE answer."
* **Rank Existence:** If your initial idea is to ask for a specific rank (e.g., "the second most expensive"), and your verification queries reveal this rank is skipped due to ties, you **MUST adjust your `final_question`** to ask for a rank that definitely exists.
4. **Iterative SQL Execution and Refinement (to find the answer):**
* **Plan & Execute SQL with Tool:** Formulate the query needed to find the answer.
* **Tool Response Length Constraint:** Data-gathering queries **MUST include a `LIMIT` clause** (e.g., `LIMIT 50`).
* **State your plan:** "To answer, I will now execute...".
* **Process Tool Results:** "My tool returned...".
* **Analyze & Refine:** Examine the actual tool data. Refine your question into a `final_question` that is perfectly aligned with the data, web agent feasibility, the required difficulty, and the ranking/tie rules.
* **CRITICAL RULE for Handling Empty Results:** If your tool returns an empty result set (e.g., `[]`), you **MUST** trust this as the ground truth. This means the entity or condition you queried for does not exist in the database. You **MUST NOT** invent data or change your query to find something else unless the initial question itself was flawed. Your subsequent steps (Answer Derivation) **MUST** reflect this "not found" status.
**Phase B: Answer Derivation (from ACTUAL Tool Results)**
1. Based on the **verified and complete data collected from the live database via your tool**, formulate an **`llm_derived_answer`** (string). This is the concise, factual answer to your `final_question`.
* **Handling "Not Found" Scenarios:** If your iterative data collection in Phase A definitively concluded that the requested information does not exist (i.e., your tool returned an empty result), your `llm_derived_answer` **MUST** be a clear, standardized indicator of absence. Use one of the following exact phrases: **"Not available"** or **"N/A"**. Do not create sentences like "The product could not be found" or "There is no data." Simply provide the standardized answer.
**Phase C: Validation Rule Design & Difficulty Rationale**
1. **Design `validation_rules`:**
* A **list of rule objects**, each with `type` and `expected_value`.
* **`type`:** Primarily `"must_include"` or `"fuzzy_match"`.
* **`expected_value`:** The specific value *extracted* from your answer, derived **directly from your tool's results**.
* **Focus on key entities/values**. Multiple rules imply AND.
2. **Formulate a `difficulty_reason`:** A concise explanation of why the task matches the difficulty, referencing the agent's workflow.
**Phase D: Reference SQL Formulation**
1. Select or compose a **single, concise `reference_sql` (string)** that represents the ground truth for the question. This SQL is for verification and does not need a `LIMIT`.
* **For ranking questions involving "most/highest/top":** The `reference_sql` **MUST** be the single, definitive query that programmatically guarantees a **Mutually Exclusive, Collectively Exhaustive (MECE)** result. It must return *all* items tied for the top rank and no others. The use of a window function (e.g., `DENSE_RANK() ... WHERE dr = 1`) is the **ideal and strongly preferred format** for this reference SQL, as it perfectly embodies the required logic. A simple `ORDER BY ... LIMIT N` query is **unacceptable** for this field in a ranking context with potential ties.
**4. Final Output Format (for this ONE QA item):**
Provide your response strictly as a single JSON object. Do not include any explanatory text outside of this JSON structure.
```json
{{
"final_question": "Your refined, novel, web-agent-feasible question string.",
"llm_derived_answer": "The natural language answer derived exclusively from actual database data obtained via your tool.",
"validation_rules": [
{{
"type": "must_include",
"expected_value": "Specific key value/entity extracted from the answer, based on actual database data."
}}
],
"reference_sql": "A single, representative SELECT SQL query that finds the ground truth for the question.",
"difficulty_reason": "A concise explanation of why the task's complexity matches the specified difficulty level, based on the web agent's required workflow."
}}
```
**5. Agent Workflow and Self-Correction Reminder:**
Your thought process should explicitly narrate steps like:
1. **"Step 1: Analyze Previous Tasks.** I have reviewed the `{PREVIOUS_GENERATED_TASKS}`. The pattern of 'find a single item and read one of its direct properties' is common. To innovate, I will create a task that requires filtering a grid and then performing an action on the *entire set* of results, like finding a common attribute among them."
2. **"Step 2: Dynamic Entity Selection (if needed).** My new idea needs a category name. I'll execute `SELECT ... FROM catalog_category_entity_varchar ...` to get one."
3. **"Step 3: Question Formulation.** I will now formulate my novel, `{DIFFICULTY}` question..."
4. **"Step 4: Answering the Question.** To find the answer, I will execute a query..."
5. **"Step 5: Deriving the Final Output.** My database tool returned... The reason this is `{DIFFICULTY}` is because..."
Please proceed with generating one QA item according to these **strict, complete, and highly detailed** instructions.
"""
# Main generation loop
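# Each iteration: (1) sample an intent group and build a schema excerpt,
# (2) render the generation prompt and let the generator agent query MySQL through
# the MCP tool, (3) reject outputs produced without any tool call, (4) have a second
# "verifier" agent re-execute the reference SQL, and (5) append the item to
# GENERATED_QA_FILE only if the verdict is CONSISTENT.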
for i in range(args.iterations):
    logging.info(f"--- Starting Iteration {i + 1}/{args.iterations} ---")
    # Randomly pick DIFFICULTY from the three difficulty levels
    # DIFFICULTY = random.choice(["Easy", "Medium", "Hard"])
    DIFFICULTY = "Easy"
    MAGENTO_SCHEMA_CONTENT = ""
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        schema_file_path = os.path.join(script_dir, "intent_group_schema.json")
        with open(schema_file_path, "r", encoding="utf-8") as f:
            intent_groups = json.load(f)
        selected_group = random.choice(intent_groups)
        intent_group_name = selected_group['intent_group_name']
        intent_group_description = selected_group['description']
        intent_group_examples = selected_group['intent_examples']
        logging.info(f"Randomly selected intent group: {intent_group_name}")
        schema_parts = [
            f"Intent Group: {intent_group_name}",
            f"Description: {intent_group_description}",
            f"Intent Examples: {intent_group_examples}"
        ]
        for table in selected_group['tables']:
            schema_parts.append(f"-- Table: {table['table_name']}\n-- Description: {table['description']}\n{table['schema']}")
        MAGENTO_SCHEMA_CONTENT = "\n\n".join(schema_parts)
    except FileNotFoundError:
        logging.error("intent_group_schema.json not found. Exiting.")
        exit(1)
    except Exception as e:
        logging.error(f"Error loading intent schema file: {e}. Exiting.")
        exit(1)
    PREVIOUS_GENERATED_TASKS = "\n".join(previous_questions)
    if PREVIOUS_GENERATED_TASKS:
        logging.info(f"Providing {len(previous_questions)} previous questions for context.")
    prompt_content = prompt14_template.format(
        DIFFICULTY=DIFFICULTY,
        PREVIOUS_GENERATED_TASKS=PREVIOUS_GENERATED_TASKS,
        MAGENTO_SCHEMA_CONTENT=MAGENTO_SCHEMA_CONTENT
    )
    # 4. Run the example: query the database in natural language
    messages = [{
        "role": "user",
        "content": prompt_content
    }]
    response_plain_text = ''
    responses = []
    try:
        for responses_chunk in bot.run(messages=messages, stream=True):
            # stream=True prints the LLM's reasoning and results incrementally
            responses = responses_chunk
            response_plain_text = typewriter_print(responses, response_plain_text)
    except Exception as e:
        logging.error(f"An error occurred during bot execution: {e}")
        continue  # Skip to the next iteration
    # The final QA item JSON is in the last response
    if not responses:
        logging.warning("Bot returned no response. Skipping iteration.")
        continue
    qa_item_str = responses[-1]["content"]
    logging.info("\n--- Generated QA Item ---")
    logging.info(qa_item_str)
    # Capture each tool-call block from the beautified transcript (text between a
    # [TOOL_CALL] marker and the next [THINK] marker).
    generator_tool_calls = re.findall(r'\[TOOL_CALL\].*?(?=\[THINK\])', response_plain_text, re.DOTALL)
    # Count occurrences of the [TOOL_RESPONSE] marker and log it.
    # If the count is 0, no tool was actually called and the answer is almost certainly wrong.
    tool_response_count = response_plain_text.count("[TOOL_RESPONSE]")
    logging.info(f"[INFO] [TOOL_RESPONSE] was observed {tool_response_count} time(s) in the generation phase.")
    # --- Start of Verification Logic ---
    if tool_response_count == 0:
        logging.warning("\n[VERIFICATION] SKIPPED: No tool calls were made during generation, the result is likely invalid.")
    else:
        logging.info("\n[VERIFICATION] STARTING: Tool calls were observed, proceeding with verification.")
        # 1. Parse the generated QA item
        qa_item = None
        try:
            # Clean up the string: find the JSON block, which might be wrapped in markdown
            match = re.search(r'\{.*\}', qa_item_str, re.DOTALL)
            if match:
                json_str = match.group(0)
                qa_item = json.loads(json_str)
            else:
                # Fallback for when the string is just the JSON without wrappers
                qa_item = json.loads(qa_item_str)
            final_question = qa_item.get("final_question")
            llm_derived_answer = qa_item.get("llm_derived_answer")
            reference_sql = qa_item.get("reference_sql")
            if not all([final_question, llm_derived_answer, reference_sql]):
                logging.error(
                    "[VERIFICATION] FAILED: The generated JSON is missing one or more required keys (final_question, llm_derived_answer, reference_sql).")
                qa_item = None  # Invalidate qa_item to skip next step
        except (json.JSONDecodeError, AttributeError) as e:
            logging.error(f"[VERIFICATION] FAILED: Could not parse the JSON response from the generator bot. Error: {e}")
            qa_item = None  # Invalidate qa_item to skip next step
        if qa_item:
            # 2. Create the verifier prompt
            verifier_prompt_template2 = """
You are a meticulous and rule-based database query verifier. Your task is to verify the consistency between a user's question, a generated answer, and a reference SQL query. You are given a tool to execute SQL queries against the database.
**Your Goal:**
Assess whether the `llm_derived_answer` is a correct and faithful response to the `final_question`, based *exclusively* on the real-time results of executing the `reference_sql`.
**Core Principles:**
1. **Truth is the SQL Result:** Your judgment must be based *solely* on the data returned by your execution of the `reference_sql`. Do not use any external knowledge.
2. **Empty is a Valid Answer:** An empty result from the SQL query (`[]`) is a definitive and trustworthy outcome. It proves that no data matching the query's criteria exists.
* If the SQL result is empty and the `llm_derived_answer` correctly states that no information is available (e.g., "There are no results," "Not available," "N/A"), you **must** judge this as `CONSISTENT`.
* Conversely, if the SQL result is empty but the `llm_derived_answer` provides any specific information (e.g., "The product is 'Super Widget'"), this is a clear hallucination from the generator and you **must** judge it as `INCONSISTENT`.
**Input for Verification:**
1. **`final_question`**: The natural language question that was asked.
```
{final_question}
```
2. **`llm_derived_answer`**: The natural language answer that was generated.
```
{llm_derived_answer}
```
3. **`reference_sql`**: The SQL query intended to produce the data for the answer.
```sql
{reference_sql}
```
**Verification Steps:**
1. **Analyze the SQL:** Carefully examine the `reference_sql`, make sure it is feasible to answer the `final_question` with the SQL. If it is not feasible, you must judge it as `NONFEASIBLE_SQL` and skip to the end.
2. **Execute the SQL:** Use your database tool to execute the `reference_sql` exactly as provided.
3. **Analyze SQL Results:** Carefully examine the data returned by the query. Note the number of rows, the values in each column, and pay close attention to whether the result is empty.
4. **Compare and Contrast:** Critically compare the `SQL Results`, the `final_question`, and the `llm_derived_answer` based on the Core Principles.
* **Data Consistency:** Does the data in `llm_derived_answer` *exactly* match the data from your `SQL Results`? For example, if the answer mentions a count of "65", did your query actually return "65"? If the answer lists specific names or SKUs, are those the exact names/SKUs your query returned?
* **Question-Answer Alignment:** Does the `llm_derived_answer` truly answer the `final_question`?
* *Example of Mismatch:* The question asks for "product names," but the answer provides only "SKUs." Even if the SKUs are correct according to the SQL, this is an alignment failure.
* **Hallucination Check:** Does the `llm_derived_answer` contain information that is NOT supported by your `SQL Results`?
* *Example of Hallucination:* The answer lists several products, but your `SQL Results` are empty. This is a critical failure. **Remember Core Principle #2.**
**Final Output Format:**
Provide your response strictly as a single JSON object with two keys: `verification_result` and `verification_reason`.
* `verification_result` (string): Must be one of `CONSISTENT`, `INCONSISTENT`, `ERROR_IN_SQL`, or `NONFEASIBLE_SQL`.
* `CONSISTENT`: The answer is fully supported by the SQL results and correctly addresses the question. This includes cases where the SQL result is empty and the answer correctly states that no data is available.
* `INCONSISTENT`: There is a mismatch. This could be due to hallucinated data, incorrect values, or a failure to align with the question's intent.
* `ERROR_IN_SQL`: The `reference_sql` failed to execute due to a syntax error or other database error.
* `NONFEASIBLE_SQL`: The `reference_sql` is not feasible to answer the `final_question`.
* `verification_reason` (string): A clear, concise explanation for your conclusion. If inconsistent, explain exactly what the mismatch was. If the SQL failed, include the error message.
**Example 1 (Consistent):**
* `llm_derived_answer`: "There are 65 orders..."
* `SQL Result`: `[{'order_count': 65}]`
* Your Output:
```json
{{
"verification_result": "CONSISTENT",
"verification_reason": "The reference_sql executed successfully and returned a count of 65, which matches the llm_derived_answer."
}}
```
**Example 2 (Inconsistent - Hallucination):**
* `llm_derived_answer`: "The product is 'Super Widget'."
* `SQL Result`: `[]` (empty)
* Your Output:
```json
{{
"verification_result": "INCONSISTENT",
"verification_reason": "The llm_derived_answer states the product is 'Super Widget', but the reference_sql returned no results, proving no such product exists for the query. The answer is a hallucination."
}}
```
**Example 3 (Inconsistent - Alignment):**
* `final_question`: "What are the names of the top products?"
* `llm_derived_answer`: "The top product SKUs are 'WIDGET-001' and 'GADGET-002'."
* `SQL Result`: `[{'sku': 'WIDGET-001'}, {'sku': 'GADGET-002'}]`
* Your Output:
```json
{{
"verification_result": "INCONSISTENT",
"verification_reason": "The final_question asks for product names, but the llm_derived_answer and reference_sql only provide SKUs. The answer does not align with the question's requirement."
}}
```
**Example 4 (Consistent - Empty Result):**
* `final_question`: "What are the names of products from the brand 'NoBrand'?"
* `llm_derived_answer`: "No, there are no products available from the brand 'NoBrand'."
* `SQL Result`: `[]` (empty)
* Your Output:
```json
{{
"verification_result": "CONSISTENT",
"verification_reason": "The reference_sql executed successfully and returned an empty set, which confirms that no products from 'NoBrand' exist. The llm_derived_answer accurately reflects this fact."
}}
```
Now, perform the verification for the provided inputs.
"""
            verifier_prompt = verifier_prompt_template2.format(
                final_question=final_question,
                llm_derived_answer=llm_derived_answer,
                reference_sql=reference_sql
            )
            # 3. Create and run the verifier bot
            verifier = Assistant(
                llm=llm_cfg,
                function_list=tools,
            )
            verifier_messages = [{"role": "user", "content": verifier_prompt}]
            logging.info("\n--- Verifier Bot ---")
            verifier_response_text = ''
            verifier_responses = []
            try:
                for verifier_responses_chunk in verifier.run(messages=verifier_messages, stream=True):
                    verifier_responses = verifier_responses_chunk
                    verifier_response_text = typewriter_print(verifier_responses, verifier_response_text)
            except Exception as e:
                logging.error(f"An error occurred during verifier bot execution: {e}")
                continue  # Skip to the next iteration
            verifier_tool_calls = re.findall(r'\[TOOL_CALL\].*?(?=\[THINK\])', verifier_response_text, re.DOTALL)
            logging.info("\n--- Verification Result ---")
            if not verifier_responses:
                logging.warning("Verifier bot returned no response. Skipping verification.")
                continue
            verifier_output_str = verifier_responses[-1]["content"]
            logging.info(verifier_output_str)
            # 4. Parse verifier output and save if consistent
            try:
                # Clean up the string: find the JSON block, which might be wrapped in markdown
                match = re.search(r'\{.*\}', verifier_output_str, re.DOTALL)
                if match:
                    json_str = match.group(0)
                    verifier_result_json = json.loads(json_str)
                else:
                    # Fallback for when the string is just the JSON without wrappers
                    verifier_result_json = json.loads(verifier_output_str)
                if verifier_result_json.get("verification_result") == "CONSISTENT":
                    logging.info(f"\n[VERIFICATION] PASSED: Result is CONSISTENT. Saving to {GENERATED_QA_FILE}.")
                    combined_item = {
                        "id": next_qa_id,
                        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "difficulty": DIFFICULTY,
                        "qa_item": qa_item,
                        "verification": verifier_result_json,
                        "generator_tool_calls": generator_tool_calls,
                        "verifier_tool_calls": verifier_tool_calls
                    }
                    with open(GENERATED_QA_FILE, 'a', encoding='utf-8') as f:
                        f.write(json.dumps(combined_item) + '\n')
                    logging.info(f"Successfully appended QA item #{next_qa_id} to {GENERATED_QA_FILE}.")
                    # Update state for the next iteration
                    previous_questions.append(qa_item['final_question'])
                    next_qa_id += 1
                else:
                    result_type = verifier_result_json.get("verification_result", "UNKNOWN_RESULT")
                    reason = verifier_result_json.get('verification_reason', 'No reason provided.')
                    logging.warning(f"\n[VERIFICATION] FAILED: Result is '{result_type}'. Reason: {reason}. Not saving.")
            except (json.JSONDecodeError, AttributeError) as e:
                logging.error(f"\n[VERIFICATION] FAILED: Could not parse JSON from verifier bot output. Error: {e}")
                logging.error(f"Verifier output was: {verifier_output_str}")
logging.info("All iterations completed.")