# AgentOccam/webagents_step/utils/llm.py
import openai
import re
import copy
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
AutoModelForSeq2SeqLM,
)
import ctranslate2
from time import sleep
import tiktoken
from openai import OpenAI
import warnings
warnings.simplefilter("ignore")
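
# Approximate USD cost per token for OpenAI models, as hard-coded in this repo;
# these figures may not reflect current OpenAI pricing.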
input_token_cost_usd_by_model = {
"gpt-4-1106-preview": 0.01 / 1000,
"gpt-4": 0.03 / 1000,
"gpt-4-32k": 0.06 / 1000,
"gpt-3.5-turbo": 0.001 / 1000,
"gpt-3.5-turbo-instruct": 0.0015 / 1000,
"gpt-3.5-turbo-16k": 0.003 / 1000,
"babbage-002": 0.0016 / 1000,
"davinci-002": 0.012 / 1000,
"ada-v2": 0.0001 / 1000,
}
output_token_cost_usd_by_model = {
"gpt-4-1106-preview": 0.03 / 1000,
"gpt-4": 0.06 / 1000,
"gpt-4-32k": 0.12 / 1000,
"gpt-3.5-turbo": 0.002 / 1000,
"gpt-3.5-turbo-instruct": 0.002 / 1000,
"gpt-3.5-turbo-16k": 0.004 / 1000,
"babbage-002": 0.0016 / 1000,
"davinci-002": 0.012 / 1000,
"ada-v2": 0.0001 / 1000,
}

def fill_prompt_template(prompt_template, objective, observation, url, previous_history):
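    """Return a deep copy of `prompt_template` whose "input" field has the
    {objective}, {observation}, {url}, and {previous_actions} placeholders filled in."""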
prompt = copy.deepcopy(prompt_template)
prompt["input"] = prompt["input"].replace("{objective}", objective)
if isinstance(observation, dict):
prompt["input"] = prompt["input"].replace("{observation}", observation["text"])
else:
prompt["input"] = prompt["input"].replace("{observation}", observation)
prompt["input"] = prompt["input"].replace("{url}", url)
prompt["input"] = prompt["input"].replace("{previous_actions}", previous_history)
return prompt

def filter_quotes_if_matches_template(action):
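    """If `action` matches the 'type [X] ["Y"]' template, strip the quotes around Y;
    otherwise return the stripped action unchanged (None passes through as None)."""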
if action is None:
return None
    # Regex pattern to match the entire 'type [X] ["Y"]' template, capturing X and Y (Y may be digits as well)
    pattern = r'^type \[(\d+)\] \["([^"\[\]]+)"\]$'
    # Check if the action matches the specific template
    match = re.match(pattern, action)
    if match:
        # Extract the element id and the text that needs to be unquoted
        element_id, y_part = match.group(1), match.group(2)
        # Reconstruct the action string without quotes around Y
        filtered_action = f'type [{element_id}] [{y_part}]'
        return filtered_action.strip()  # filtered_action.split("\n")[0].strip()
else:
# Return the original action if it doesn't match the template
return action.strip() # action.split("\n")[0].strip()

def parse_action_reason(model_response):
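    """Extract the REASON and ACTION (or ACTIONS) fields from a model response.
    Returns (action, reason); either element may be None if its field is missing."""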
reason_match = re.search(r'REASON:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
reason = reason_match.group(1) if reason_match else None
# action_match = re.search(r'ACTION:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
action_match = re.search(r'(?:ACTION|ACTIONS):\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
action = action_match.group(1) if action_match else None
action = filter_quotes_if_matches_template(action)
return action, reason

def construct_llm_message_hf(prompt, prompt_mode, model_type="llama2"):
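    """Build chat- or completion-style messages for a Hugging Face model, wrapping
    the instruction in Llama-2 [INST] tags when `model_type` is "llama2"."""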
if model_type == "llama2":
instruction = "<s>[INST] " + prompt["instruction"]
else:
instruction = prompt["instruction"]
messages = [{"role": "system", "content": instruction}]
if prompt["examples"]:
messages.append({"role": "system", "content": "Here are a few examples:"})
for example in prompt["examples"]:
messages.append({"role": "system", "content": f"\n### Input:\n{example['input']}\n\n### Response:\n{example['response']}"})
if model_type == "llama2":
query = f"\nHere is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n[/INST]\n"
else:
query = f"\nHere is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"
messages.append({"role": "user", "content": query})
if prompt_mode == "chat":
return messages
elif prompt_mode == "completion":
all_content = ''.join(message['content'] for message in messages)
messages_completion = [{"role": "user", "content": all_content}]
return messages_completion

def construct_llm_message_anthropic(prompt, plan_list=None, action_list=None):
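    """Build the (system_message, messages) pair for an Anthropic Claude request.
    When `plan_list` and `action_list` are given, the corresponding instruction
    files are spliced into the instruction; a visual observation, if present, is
    interleaved via claude.arrange_message_for_claude."""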
if plan_list and action_list:
import os
from global_utils import CURRENT_DIR
assert len(plan_list) > 0 and len(action_list) > 0
plan_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "plan_instructions", f"{p}.txt"), "r").readlines()) for p in plan_list])
action_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "action_instructions", f"{a}.txt"), "r").readlines()) for a in action_list])
prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions)
prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions)
system_message = prompt["instruction"]
if prompt["examples"]:
system_message += f"\n\n## Here are a few examples:"
for i, example in enumerate(prompt["examples"]):
example_input = example["input"]
example_response = example["response"]
if "example_format" in prompt.keys():
system_message += "\n\n"
                system_message += prompt["example_format"].replace("{i}", str(i)).replace("{example_input}", example_input).replace("{example_response}", example_response)
else:
system_message += f"\n\n| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"
if "input_format" in prompt.keys():
if "{visual_observation}" in prompt.keys():
from claude import arrange_message_for_claude
text = prompt["input_format"].replace("{input}", prompt['input'])
text_prior, text_subsequent = text.split("{visual_observation}")
messages = arrange_message_for_claude([("text", text_prior), ("image", prompt["{visual_observation}"]), ("text", text_subsequent)])
else:
messages = [{"role": "user", "content": [{"type": "text", "text": prompt["input_format"].replace("{input}", prompt['input'])}]}]
else:
if "{visual_observation}" in prompt.keys():
pass
else:
messages = [{"role": "user", "content": [{"type": "text", "text": f"## Here is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"}]}]
return system_message, messages

def construct_llm_message_openai(prompt, prompt_mode, plan_list=None, action_list=None):
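    """Build OpenAI chat messages from `prompt`. When `plan_list` and `action_list`
    are given, the corresponding instruction files are spliced into the system
    instruction. In "completion" mode all messages are concatenated into a single
    user message."""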
if not (plan_list and action_list):
messages = [{"role": "system", "content": prompt["instruction"]}]
if prompt["examples"]:
messages.append({"role": "system", "content": "Here are a few examples:"})
for example in prompt["examples"]:
messages.append({"role": "system", "content": f"\n### Input:\n{example['input']}\n\n### Response:\n{example['response']}"})
messages.append({"role": "user", "content": f"Here is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"})
if prompt_mode == "chat":
return messages
elif prompt_mode == "completion":
all_content = ''.join(message['content'] for message in messages)
messages_completion = [{"role": "user", "content": all_content}]
return messages_completion
import os
from global_utils import CURRENT_DIR
assert len(plan_list) > 0 and len(action_list) > 0
plan_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "plan_instructions", f"{p}.txt"), "r").readlines()) for p in plan_list])
action_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "action_instructions", f"{a}.txt"), "r").readlines()) for a in action_list])
prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions)
prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions)
messages = [{"role": "system", "content": prompt["instruction"]}]
if prompt["examples"]:
messages.append({"role": "system", "content": "## Here are a few examples:"})
for i, example in enumerate(prompt["examples"]):
example_input = example["input"]
example_response = example["response"]
messages.append({"role": "system", "content": f"| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"})
if "input_format" in prompt.keys():
messages.append({"role": "user", "content": prompt["input_format"].replace("{input}", prompt['input'])})
else:
messages.append({"role": "user", "content": f"## Here is the current Input. Please respond with PLAN, REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"})
if prompt_mode == "chat":
return messages
elif prompt_mode == "completion":
all_content = ''.join(message['content'] for message in messages)
messages_completion = [{"role": "user", "content": all_content}]
return messages_completion

def call_anthropic_llm(system_message, messages, model="anthropic.claude-3-haiku-20240307-v1:0", **model_kwargs):
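    """Send a chat conversation to an Anthropic Claude model via Amazon Bedrock
    and return the response text."""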
# Use the native inference API to send a text message to Anthropic Claude.
import boto3
import json
# Create a Bedrock Runtime client in the AWS Region of your choice.
client = boto3.client("bedrock-runtime", region_name="us-east-1")
print(system_message, file=open("trash.txt", "a"))
print("\n".join(item["content"][0]["text"] for item in messages), end="\n"+"#"*100+"\n", file=open("trash.txt", "a"))
    native_request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": model_kwargs.get("max_tokens", 512),
        "temperature": model_kwargs.get("temperature", 0.5),
        "system": system_message,
        "messages": messages,
    }
# Convert the native request to JSON.
request = json.dumps(native_request)
try:
# Invoke the model with the request.
response = client.invoke_model(modelId=model, body=request)
except Exception as e:
        raise RuntimeError(f"ERROR: Can't invoke '{model}'. Reason: {e}") from e
# Decode the response body.
model_response = json.loads(response["body"].read())
# Extract and print the response text.
response_text = model_response["content"][0]["text"]
return response_text

def call_openai_llm(messages, model="gpt-3.5-turbo", **model_kwargs):
"""
Sends a request with a chat conversation to OpenAI's chat API and returns a response.
Parameters:
messages (list)
A list of dictionaries containing the messages to send to the chatbot.
model (str)
The model to use for the chatbot. Default is "gpt-3.5-turbo".
temperature (float)
The temperature to use for the chatbot. Defaults to 0. Note that a temperature
of 0 does not guarantee the same response (https://platform.openai.com/docs/models/gpt-3-5).
Returns:
response (Optional[dict])
The response from OpenAI's chat API, if any.
"""
# client = OpenAI()
temperature = model_kwargs.get('temperature', 0.7)
top_p = model_kwargs.get('top_p', 1.0)
n = model_kwargs.get('n', 1)
for m in messages:
print(m["content"], file=open("trash.txt", "a"))
print("*"*100, file=open("trash.txt", "a"))
num_attempts = 0
while True:
if num_attempts >= 10:
raise ValueError("OpenAI request failed.")
try:
if model=="text-davinci-003":
response = openai.Completion.create(
model=model,
prompt=messages[0]["content"],
temperature=temperature,
top_p=top_p,
n=n,
max_tokens=128)
return response.choices[0].text.strip()
response = OpenAI().chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
top_p=top_p,
n=n
)
return response.choices[0].message.content.strip()
except openai.AuthenticationError as e:
print(e)
return None
except openai.RateLimitError as e:
print(e)
print("Sleeping for 10 seconds...")
sleep(10)
num_attempts += 1
except Exception as e:
print(e)
print("Sleeping for 10 seconds...")
sleep(10)
num_attempts += 1

def get_num_tokens(text: str, model_name: str) -> int:
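    """Count the tokens in `text` using the tiktoken encoding for `model_name`."""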
tokenizer = tiktoken.encoding_for_model(model_name=model_name)
return len(tokenizer.encode_ordinary(text))

def calculate_cost_openai(messages: list, response: str, model_name: str) -> float:
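    """Roughly estimate the USD cost of an OpenAI chat call from the per-token
    price tables above. Message contents are joined with spaces before token
    counting, so chat-format overhead tokens are not included. Returns 0 for
    models missing from the price tables."""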
input_text = " ".join([msg["content"] for msg in messages])
num_input_tokens = get_num_tokens(input_text, model_name)
num_output_tokens = get_num_tokens(response, model_name)
input_token_cost = input_token_cost_usd_by_model.get(model_name, None)
output_token_cost = output_token_cost_usd_by_model.get(model_name, None)
if input_token_cost is None or output_token_cost is None:
print(f"[calculate_cost_openai] unknown model {model_name}")
return 0
return num_input_tokens * input_token_cost + num_output_tokens * output_token_cost

def load_tokenizer(mpath, context_size):
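    """Load a Hugging Face tokenizer from `mpath`. `context_size` is currently
    unused because the related settings below are commented out."""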
tokenizer = AutoTokenizer.from_pretrained(mpath, return_token_type_ids=False)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.model_max_length = context_size
# tokenizer.padding_side = "right"
# tokenizer.truncation_side = "left"
# tokenizer.add_eos_token = True
return tokenizer

def load_model(mpath, dtype, device="cuda", context_len=4096, is_seq2seq=False, ct2_mpath=None):
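    """Load a causal-LM (or seq2seq) model from `mpath` in one of several precisions:
    "bf16", "4bit", "4bit-optimized" (NF4 with double quantization), "8bit", or
    "ct2" (a ctranslate2.Generator built from `ct2_mpath`). `device` is only used
    for the "ct2" backend; the other branches rely on `device_map`."""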
if is_seq2seq:
model_loader = AutoModelForSeq2SeqLM
else:
model_loader = AutoModelForCausalLM
if dtype == "bf16":
model = model_loader.from_pretrained(
mpath,
max_position_embeddings=context_len,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
device_map="balanced_low_0",
)
elif dtype == "4bit":
model = model_loader.from_pretrained(
mpath,
max_position_embeddings=context_len,
low_cpu_mem_usage=True,
load_in_4bit=True,
device_map="auto",
)
elif dtype == "4bit-optimized":
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = model_loader.from_pretrained(
mpath,
use_cache=True,
device_map="auto",
quantization_config=bnb_config,
max_position_embeddings=context_len,
)
elif dtype == "8bit":
model = model_loader.from_pretrained(
mpath,
max_position_embeddings=context_len,
low_cpu_mem_usage=True,
load_in_8bit=True,
device_map="auto",
)
elif dtype == "ct2":
assert ct2_mpath is not None
model = ctranslate2.Generator(ct2_mpath, device=device)
return model
# @torch.no_grad()
# def generate_prediction(
# inputs,
# model,
# tokenizer,
# max_new_tokens,
# is_seq2seq=False,
# **kwargs,
# # num_beams,
# # do_sample,
# # no_repeat_ngram_size,
# # temperature,
# # top_k,
# # top_p,
# ):
# input_ids = tokenizer(inputs, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length - max_new_tokens).input_ids
# outputs = model.generate(
# input_ids=input_ids.cuda(),
# max_new_tokens=max_new_tokens,
# **kwargs,
# ).cpu()
# torch.cuda.empty_cache()
# if not is_seq2seq:
# outputs = outputs[:, input_ids.shape[1] :]
# prediction = [
# p.split(tokenizer.pad_token, 1)[0]
# for p in tokenizer.batch_decode(outputs, skip_special_tokens=True)
# ][0].strip()
# return prediction

def generate_prediction(
inputs,
model,
tokenizer,
**kwargs,
):
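    """Generate a completion for `inputs` with a Hugging Face model: tokenize, call
    model.generate with the supplied kwargs, strip the prompt tokens from the output,
    and decode the first returned sequence to text."""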
inputs = tokenizer([inputs], return_tensors='pt', truncation=True, add_special_tokens=False).to(model.device)
# if torch.cuda.is_available():
# inputs = inputs.to('cuda')
outputs = model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
**kwargs,
)
outputs = outputs[:, inputs.input_ids.shape[1] :]
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
return prediction