为了在 LangChain 中处理长输入文本并动态管理分词(tokenization),可以先扣除提示词以及输入中其他固定文本所占用的 token 数,从而计算出留给正文的可用 token 数。
KarolZmijewski 的回答很好,但我想补充一个额外的函数 `process_dynamic_tokenization` 来解决该问题。
from transformers import GPT2Tokenizer
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
# GPT-2 BPE tokenizer, used only to estimate token counts when sizing chunks.
# NOTE(review): the target model presumably uses a different encoding than
# GPT-2, so these counts are approximate — confirm the margin is acceptable.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# PromptTemplate is a pydantic model and does not accept the template string
# as a positional argument; from_template() builds it and infers the "text"
# input variable from the placeholder.
llm_chain = LLMChain(
    llm=OpenAI(model="gpt-3.5-turbo"),
    prompt=PromptTemplate.from_template(
        "Analyze the following text and summarize it: {text}"
    ),
)
def split_text_into_chunks(text, max_tokens):
    """Split *text* into consecutive pieces of at most *max_tokens* tokens.

    The text is tokenized with the module-level ``tokenizer``; the token list
    is cut into fixed-size windows and each window is converted back into a
    string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of chunk strings in their original order. Empty when *text*
        produces no tokens.

    Raises:
        ValueError: If *max_tokens* is not positive.
    """
    # Validate first: a negative step would silently yield no chunks (dropping
    # all of the text) and a zero step fails with an opaque range() error.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer.")

    tokens = tokenizer.tokenize(text)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]
def process_long_text(text, max_tokens, chain=None):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The full text to process.
        max_tokens: Maximum number of tokens per chunk, forwarded to
            ``split_text_into_chunks``.
        chain: Object exposing ``run(text=...)`` that is called once per
            chunk. Defaults to the module-level ``llm_chain`` so existing
            two-argument callers keep working.

    Returns:
        The per-chunk responses joined with single spaces (an empty string
        when there are no chunks).
    """
    active_chain = llm_chain if chain is None else chain
    chunks = split_text_into_chunks(text, max_tokens)
    return " ".join(active_chain.run(text=chunk) for chunk in chunks)


def process_dynamic_tokenization(
    text,
    prompt_template,
    llm_chain,
    max_model_tokens=4096,
    reserved_output_tokens=256,
):
    """Process *text* in chunks sized to fit the model's token limit.

    The chunk budget is what remains of *max_model_tokens* after subtracting
    the fixed prompt text and the space reserved for the model's reply.

    Args:
        text: The full text to process.
        prompt_template: Template whose ``template`` string contains a
            ``{text}`` placeholder. It is used only to measure the fixed
            prompt overhead, so it should match the prompt used by
            *llm_chain*.
        llm_chain: Chain that is run on every chunk.
        max_model_tokens: Total token limit of the model.
        reserved_output_tokens: Tokens held back for the generated reply,
            since the limit covers the prompt and the completion together.
            Pass 0 to give the whole remaining budget to the input.

    Returns:
        The joined per-chunk responses, as returned by ``process_long_text``.

    Raises:
        ValueError: If *reserved_output_tokens* is negative, or if no tokens
            remain for the text after the prompt and reply reservation.
    """
    if reserved_output_tokens < 0:
        raise ValueError("reserved_output_tokens must not be negative.")

    # Render the template with an empty body to count only its fixed tokens.
    fixed_prompt = prompt_template.template.format(text="")
    prompt_tokens = len(tokenizer.tokenize(fixed_prompt))

    available_tokens = max_model_tokens - prompt_tokens - reserved_output_tokens
    if available_tokens <= 0:
        raise ValueError("Prompt template is too long for the model's token limit.")

    # Pass the caller's chain through so it is actually the one that runs.
    return process_long_text(text, max_tokens=available_tokens, chain=llm_chain)
# Example usage: summarize a long document with a chunk size derived from the
# model's token limit.
long_text = "This is a very long document..."
final_summary = process_dynamic_tokenization(
    text=long_text,
    # PromptTemplate cannot be built from a positional string; use the
    # from_template() constructor instead.
    prompt_template=PromptTemplate.from_template(
        "Analyze the following text and summarize it: {text}"
    ),
    llm_chain=llm_chain,
    max_model_tokens=4096,
)
print(final_summary)