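"""Utilities for LLM-based web agents: prompt template filling, chat message
construction for Hugging Face / Anthropic / OpenAI models, API call wrappers
with retries, token counting and cost estimation, and local model loading."""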
import copy
import re
import warnings
from time import sleep

import ctranslate2
import openai
import tiktoken
import torch
from openai import OpenAI
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

warnings.simplefilter("ignore")

input_token_cost_usd_by_model = {
    "gpt-4-1106-preview": 0.01 / 1000,
    "gpt-4": 0.03 / 1000,
    "gpt-4-32k": 0.06 / 1000,
    "gpt-3.5-turbo": 0.001 / 1000,
    "gpt-3.5-turbo-instruct": 0.0015 / 1000,
    "gpt-3.5-turbo-16k": 0.003 / 1000,
    "babbage-002": 0.0016 / 1000,
    "davinci-002": 0.012 / 1000,
    "ada-v2": 0.0001 / 1000,
}

output_token_cost_usd_by_model = {
    "gpt-4-1106-preview": 0.03 / 1000,
    "gpt-4": 0.06 / 1000,
    "gpt-4-32k": 0.12 / 1000,
    "gpt-3.5-turbo": 0.002 / 1000,
    "gpt-3.5-turbo-instruct": 0.002 / 1000,
    "gpt-3.5-turbo-16k": 0.004 / 1000,
    "babbage-002": 0.0016 / 1000,
    "davinci-002": 0.012 / 1000,
    "ada-v2": 0.0001 / 1000,
}

def fill_prompt_template(prompt_template, objective, observation, url, previous_history):
    prompt = copy.deepcopy(prompt_template)
    prompt["input"] = prompt["input"].replace("{objective}", objective)
    if isinstance(observation, dict):
        prompt["input"] = prompt["input"].replace("{observation}", observation["text"])
    else:
        prompt["input"] = prompt["input"].replace("{observation}", observation)
    prompt["input"] = prompt["input"].replace("{url}", url)
    prompt["input"] = prompt["input"].replace("{previous_actions}", previous_history)
    return prompt

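# Usage sketch (hypothetical template dict, for illustration only):
#   template = {"input": "Objective: {objective}\nURL: {url}\nObservation: {observation}\nPrevious: {previous_actions}"}
#   filled = fill_prompt_template(template, "find cheap flights", "page text", "https://example.com", "None")
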
def filter_quotes_if_matches_template(action):
    if action is None:
        return None

    # Regex pattern to match the entire 'type [X] ["Y"]' template, allowing for Y to be digits as well
    pattern = r'^type \[(\d+)\] \["([^"\[\]]+)"\]$'
    # Check if the action matches the specific template
    match = re.match(pattern, action)
    if match:
        # Extract the element id and the quoted part that needs to be unquoted
        element_id, y_part = match.group(1), match.group(2)
        # Reconstruct the action string without quotes around Y
        filtered_action = f'type [{element_id}] [{y_part}]'
        return filtered_action.strip()  # filtered_action.split("\n")[0].strip()
    else:
        # Return the original action if it doesn't match the template
        return action.strip()  # action.split("\n")[0].strip()

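# Example: 'type [13] ["New York"]' -> 'type [13] [New York]'; any other
# action string is returned unchanged (modulo stripping).
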
def parse_action_reason(model_response):
    reason_match = re.search(r'REASON:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
    reason = reason_match.group(1) if reason_match else None

    # action_match = re.search(r'ACTION:\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
    action_match = re.search(r'(?:ACTION|ACTIONS):\s*(.*?)\s*(?=\n[A-Z]|$)', model_response, re.DOTALL)
    action = action_match.group(1) if action_match else None

    action = filter_quotes_if_matches_template(action)

    return action, reason

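# Example: parse_action_reason("REASON: need to search\nACTION: click [5]")
# returns ("click [5]", "need to search").
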
def construct_llm_message_hf(prompt, prompt_mode, model_type="llama2"):
    if model_type == "llama2":
        instruction = "<s>[INST] " + prompt["instruction"]
    else:
        instruction = prompt["instruction"]

    messages = [{"role": "system", "content": instruction}]

    if prompt["examples"]:
        messages.append({"role": "system", "content": "Here are a few examples:"})
        for example in prompt["examples"]:
            messages.append({"role": "system", "content": f"\n### Input:\n{example['input']}\n\n### Response:\n{example['response']}"})

    if model_type == "llama2":
        query = f"\nHere is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n[/INST]\n"
    else:
        query = f"\nHere is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"

    messages.append({"role": "user", "content": query})
    if prompt_mode == "chat":
        return messages
    elif prompt_mode == "completion":
        all_content = ''.join(message['content'] for message in messages)
        messages_completion = [{"role": "user", "content": all_content}]
        return messages_completion
    else:
        raise ValueError(f"Unknown prompt_mode: {prompt_mode!r}")

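# "chat" keeps the role-separated message list; "completion" flattens all
# content into a single user message for completion-style endpoints.
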
def construct_llm_message_anthropic(prompt, plan_list=None, action_list=None):
    if plan_list and action_list:
        import os
        from global_utils import CURRENT_DIR

        assert len(plan_list) > 0 and len(action_list) > 0
        plan_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "plan_instructions", f"{p}.txt"), "r").readlines()) for p in plan_list])
        action_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "action_instructions", f"{a}.txt"), "r").readlines()) for a in action_list])
        prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions)
        prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions)

    system_message = prompt["instruction"]

    if prompt["examples"]:
        system_message += "\n\n## Here are a few examples:"
        for i, example in enumerate(prompt["examples"]):
            example_input = example["input"]
            example_response = example["response"]
            if "example_format" in prompt.keys():
                system_message += "\n\n"
                system_message += prompt["example_format"].replace("{i}", str(i)).replace("{example_input}", example_input).replace("{example_response}", example_response)
            else:
                system_message += f"\n\n| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"

    if "input_format" in prompt.keys():
        if "{visual_observation}" in prompt.keys():
            from claude import arrange_message_for_claude
            text = prompt["input_format"].replace("{input}", prompt['input'])
            text_prior, text_subsequent = text.split("{visual_observation}")
            messages = arrange_message_for_claude([("text", text_prior), ("image", prompt["{visual_observation}"]), ("text", text_subsequent)])
        else:
            messages = [{"role": "user", "content": [{"type": "text", "text": prompt["input_format"].replace("{input}", prompt['input'])}]}]
    else:
        if "{visual_observation}" in prompt.keys():
            # A visual observation without an "input_format" template would leave
            # `messages` unbound; fail loudly instead.
            raise NotImplementedError("visual observation requires an 'input_format' template")
        else:
            messages = [{"role": "user", "content": [{"type": "text", "text": f"## Here is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"}]}]

    return system_message, messages

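# Returns a (system_message, messages) pair in the shape expected by the
# Anthropic Messages API: a system string plus user messages whose content
# is a list of typed blocks.
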
def construct_llm_message_openai(prompt, prompt_mode, plan_list=None, action_list=None):
    if not (plan_list and action_list):
        messages = [{"role": "system", "content": prompt["instruction"]}]

        if prompt["examples"]:
            messages.append({"role": "system", "content": "Here are a few examples:"})
            for example in prompt["examples"]:
                messages.append({"role": "system", "content": f"\n### Input:\n{example['input']}\n\n### Response:\n{example['response']}"})

        messages.append({"role": "user", "content": f"Here is the current Input. Please respond with REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"})
        if prompt_mode == "chat":
            return messages
        elif prompt_mode == "completion":
            all_content = ''.join(message['content'] for message in messages)
            messages_completion = [{"role": "user", "content": all_content}]
            return messages_completion
        else:
            raise ValueError(f"Unknown prompt_mode: {prompt_mode!r}")

    import os
    from global_utils import CURRENT_DIR

    assert len(plan_list) > 0 and len(action_list) > 0
    plan_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "plan_instructions", f"{p}.txt"), "r").readlines()) for p in plan_list])
    action_instructions = "\n".join(["".join(open(os.path.join(CURRENT_DIR, "prompts", "action_instructions", f"{a}.txt"), "r").readlines()) for a in action_list])
    prompt["instruction"] = prompt["instruction"].replace("{plan_instructions}", plan_instructions)
    prompt["instruction"] = prompt["instruction"].replace("{action_instructions}", action_instructions)

    messages = [{"role": "system", "content": prompt["instruction"]}]

    if prompt["examples"]:
        messages.append({"role": "system", "content": "## Here are a few examples:"})
        for i, example in enumerate(prompt["examples"]):
            example_input = example["input"]
            example_response = example["response"]
            messages.append({"role": "system", "content": f"| Example {i}\n\n### Input:\n{example_input}\n\n### Response: Let's think step by step.\n{example_response}"})

    if "input_format" in prompt.keys():
        messages.append({"role": "user", "content": prompt["input_format"].replace("{input}", prompt['input'])})
    else:
        messages.append({"role": "user", "content": f"## Here is the current Input. Please respond with PLAN, REASON and ACTION.\n### Input:\n{prompt['input']}\n\n### Response:"})
    if prompt_mode == "chat":
        return messages
    elif prompt_mode == "completion":
        all_content = ''.join(message['content'] for message in messages)
        messages_completion = [{"role": "user", "content": all_content}]
        return messages_completion
    else:
        raise ValueError(f"Unknown prompt_mode: {prompt_mode!r}")

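# Usage sketch (hypothetical prompt dict, for illustration only):
#   prompt = {"instruction": "You are a web agent.", "examples": [], "input": "click the login button"}
#   messages = construct_llm_message_openai(prompt, prompt_mode="chat")
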
def call_anthropic_llm(system_message, messages, model="anthropic.claude-3-haiku-20240307-v1:0", **model_kwargs):
    # Use the native inference API to send a text message to Anthropic Claude.

    import boto3
    import json

    # Create a Bedrock Runtime client in the AWS Region of your choice.
    client = boto3.client("bedrock-runtime", region_name="us-east-1")
    with open("trash.txt", "a") as f:
        print(system_message, file=f)
        print("\n".join(item["content"][0]["text"] for item in messages), end="\n" + "#" * 100 + "\n", file=f)
    native_request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": model_kwargs.get("max_tokens", 512),
        "temperature": model_kwargs.get("temperature", 0.5),
        "system": system_message,
        "messages": messages,
    }

    # Convert the native request to JSON.
    request = json.dumps(native_request)

    try:
        # Invoke the model with the request.
        response = client.invoke_model(modelId=model, body=request)
    except Exception as e:
        raise RuntimeError(f"ERROR: Can't invoke '{model}'. Reason: {e}")

    # Decode the response body.
    model_response = json.loads(response["body"].read())

    # Extract and return the response text.
    response_text = model_response["content"][0]["text"]
    return response_text

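# Note: invoke_model sends the Bedrock "native" Anthropic request schema, so
# `messages` must already be in Anthropic content-block format (see
# construct_llm_message_anthropic above).
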
def call_openai_llm(messages, model="gpt-3.5-turbo", **model_kwargs):
    """
    Sends a request with a chat conversation to OpenAI's chat API and returns a response.

    Parameters:
        messages (list)
            A list of dictionaries containing the messages to send to the chatbot.
        model (str)
            The model to use for the chatbot. Default is "gpt-3.5-turbo".
        temperature (float)
            The temperature to use for the chatbot. Defaults to 0.7. Note that even a
            temperature of 0 does not guarantee the same response
            (https://platform.openai.com/docs/models/gpt-3-5).

    Returns:
        response (Optional[str])
            The text of the model's reply, or None on an authentication error.
    """
    # client = OpenAI()
    temperature = model_kwargs.get('temperature', 0.7)
    top_p = model_kwargs.get('top_p', 1.0)
    n = model_kwargs.get('n', 1)

    with open("trash.txt", "a") as f:
        for m in messages:
            print(m["content"], file=f)
            print("*" * 100, file=f)

    num_attempts = 0
    while True:
        if num_attempts >= 10:
            raise ValueError("OpenAI request failed.")
        try:
            if model == "text-davinci-003":
                # Legacy completions endpoint, via the openai>=1.0 client
                # (the module-level openai.Completion API was removed in v1).
                response = OpenAI().completions.create(
                    model=model,
                    prompt=messages[0]["content"],
                    temperature=temperature,
                    top_p=top_p,
                    n=n,
                    max_tokens=128)
                return response.choices[0].text.strip()

            response = OpenAI().chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                n=n
            )
            return response.choices[0].message.content.strip()
        except openai.AuthenticationError as e:
            print(e)
            return None
        except openai.RateLimitError as e:
            print(e)
            print("Sleeping for 10 seconds...")
            sleep(10)
            num_attempts += 1
        except Exception as e:
            print(e)
            print("Sleeping for 10 seconds...")
            sleep(10)
            num_attempts += 1

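# Usage sketch:
#   reply = call_openai_llm([{"role": "user", "content": "Say hi"}], model="gpt-3.5-turbo", temperature=0)
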
def get_num_tokens(text: str, model_name: str) -> int:
    tokenizer = tiktoken.encoding_for_model(model_name=model_name)
    return len(tokenizer.encode_ordinary(text))

def calculate_cost_openai(messages: list, response: str, model_name: str) -> float:
    input_text = " ".join([msg["content"] for msg in messages])
    num_input_tokens = get_num_tokens(input_text, model_name)
    num_output_tokens = get_num_tokens(response, model_name)

    input_token_cost = input_token_cost_usd_by_model.get(model_name, None)
    output_token_cost = output_token_cost_usd_by_model.get(model_name, None)
    if input_token_cost is None or output_token_cost is None:
        print(f"[calculate_cost_openai] unknown model {model_name}")
        return 0
    return num_input_tokens * input_token_cost + num_output_tokens * output_token_cost

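# Example: 1,000 input tokens and 500 output tokens on gpt-3.5-turbo cost
# roughly 1000 * 0.001/1000 + 500 * 0.002/1000 = $0.002.
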
def load_tokenizer(mpath, context_size):
    tokenizer = AutoTokenizer.from_pretrained(mpath, return_token_type_ids=False)
    # tokenizer.pad_token = tokenizer.eos_token
    # tokenizer.pad_token_id = tokenizer.eos_token_id
    # tokenizer.model_max_length = context_size
    # tokenizer.padding_side = "right"
    # tokenizer.truncation_side = "left"
    # tokenizer.add_eos_token = True
    return tokenizer

def load_model(mpath, dtype, device="cuda", context_len=4096, is_seq2seq=False, ct2_mpath=None):
    if is_seq2seq:
        model_loader = AutoModelForSeq2SeqLM
    else:
        model_loader = AutoModelForCausalLM

    if dtype == "bf16":
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            device_map="balanced_low_0",
        )
    elif dtype == "4bit":
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            load_in_4bit=True,
            device_map="auto",
        )
    elif dtype == "4bit-optimized":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = model_loader.from_pretrained(
            mpath,
            use_cache=True,
            device_map="auto",
            quantization_config=bnb_config,
            max_position_embeddings=context_len,
        )
    elif dtype == "8bit":
        model = model_loader.from_pretrained(
            mpath,
            max_position_embeddings=context_len,
            low_cpu_mem_usage=True,
            load_in_8bit=True,
            device_map="auto",
        )
    elif dtype == "ct2":
        assert ct2_mpath is not None
        model = ctranslate2.Generator(ct2_mpath, device=device)
    else:
        # An unrecognized dtype would otherwise raise NameError on `model` below.
        raise ValueError(f"Unknown dtype: {dtype!r}")

    return model

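# Supported dtype values: "bf16", "4bit", "4bit-optimized" (NF4 with double
# quantization), "8bit", and "ct2" (a CTranslate2 generator, which requires
# ct2_mpath).
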
# @torch.no_grad()
# def generate_prediction(
#     inputs,
#     model,
#     tokenizer,
#     max_new_tokens,
#     is_seq2seq=False,
#     **kwargs,
#     # num_beams,
#     # do_sample,
#     # no_repeat_ngram_size,
#     # temperature,
#     # top_k,
#     # top_p,
# ):
#     input_ids = tokenizer(inputs, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length - max_new_tokens).input_ids
#
#     outputs = model.generate(
#         input_ids=input_ids.cuda(),
#         max_new_tokens=max_new_tokens,
#         **kwargs,
#     ).cpu()
#
#     torch.cuda.empty_cache()
#     if not is_seq2seq:
#         outputs = outputs[:, input_ids.shape[1] :]
#
#     prediction = [
#         p.split(tokenizer.pad_token, 1)[0]
#         for p in tokenizer.batch_decode(outputs, skip_special_tokens=True)
#     ][0].strip()
#
#     return prediction

def generate_prediction(
    inputs,
    model,
    tokenizer,
    **kwargs,
):
    inputs = tokenizer([inputs], return_tensors='pt', truncation=True, add_special_tokens=False).to(model.device)

    # if torch.cuda.is_available():
    #     inputs = inputs.to('cuda')
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        **kwargs,
    )

    # Keep only the newly generated tokens (drop the echoed prompt).
    outputs = outputs[:, inputs.input_ids.shape[1]:]
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return prediction
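
# Usage sketch (hypothetical model path, for illustration only):
#   tokenizer = load_tokenizer("meta-llama/Llama-2-7b-chat-hf", context_size=4096)
#   model = load_model("meta-llama/Llama-2-7b-chat-hf", dtype="bf16")
#   print(generate_prediction("Hello", model, tokenizer, max_new_tokens=32))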