我正在尝试制作一个Python脚本,该脚本遍历我们的一些项目文档(.pdf),并将状态/名称从R更新为F。原始文本是用Helvetica粗体书写的,但新文本没有粗体。我尝试了网上能找到的所有选择,但都无济于事。
请注意,我不是程序员,所以这是通过ChatGPT完成的,请不要解雇我:D
import os
import shutil
import fitz # PyMuPDF
import re
def rename_and_copy_files():
base_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'original')
updated_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'updated')
if not os.path.exists(updated_directory):
os.makedirs(updated_directory)
font_path = "Helvetica-Bold.ttf"
if not os.path.isfile(font_path):
print(f"Font file not found: {font_path}")
return
for filename in os.listdir(base_directory):
if '_R_' in filename:
new_filename = filename.replace('_R_', '_F_')
src = os.path.join(base_directory, filename)
dst = os.path.join(updated_directory, new_filename)
print(f"Processing file: {filename}")
if filename.endswith('.pdf'):
update_pdf_text(src, dst, font_path)
else:
shutil.copy2(src, dst)
print(f"Copied and renamed: {filename} to {new_filename}")
def update_pdf_text(src, dst, font_path):
document = fitz.open(src)
for page_num in range(len(document)):
page = document[page_num]
text_instances = page.search_for("_R_")
for inst in text_instances:
rect = fitz.Rect(inst)
full_text, start_rect, end_rect = extract_full_name(page, rect)
if not full_text:
continue
updated_text = full_text.replace('_R_', '_F_')
page.draw_rect(fitz.Rect(start_rect.x0, start_rect.y0, end_rect.x1, end_rect.y1), color=(1, 1, 1),
fill=(1, 1, 1))
new_y = start_rect.y0 + 10
page.insert_text((start_rect.x0, new_y),
updated_text,
fontsize=10,
fontfile=font_path,
color=(0, 0, 0))
single_r_instances = page.search_for(" R ")
for inst in single_r_instances:
rect = fitz.Rect(inst)
page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))
new_y = rect.y0 + 10
page.insert_text((rect.x0, new_y),
"F",
fontsize=10,
fontfile=font_path,
color=(0, 0, 0))
document.save(dst, garbage=4, deflate=True)
document.close()
def extract_full_name(page, rect):
full_text = ""
start_rect = rect
end_rect = rect
words = page.get_text("words")
name_pattern = re.compile(r'[A-Za-z0-9_\-]+')
for word in words:
word_text = word[4]
if rect.intersects(fitz.Rect(word[:4])) and name_pattern.match(word_text):
start_rect = fitz.Rect(word[:4]) if fitz.Rect(word[:4]).x0 < start_rect.x0 else start_rect
end_rect = fitz.Rect(word[:4]) if fitz.Rect(word[:4]).x1 > end_rect.x1 else end_rect
full_text += word_text
return full_text, start_rect, end_rect
if __name__ == "__main__":
rename_and_copy_files()
print("Process finished")
我尝试过使用不同版本的Helvetica bold(.otf和.ttf),也尝试过使用其他字体(Comic sans只是为了看看它是否会被选中,但没有)。没有字体文件,程序什么也不做,所以它必须意识到它的存在。。。只是不使用它。
我猜问题就在这一部分:
page.insert_text((rect.x0, new_y),
"F",
fontsize=10,
fontfile=font_path,
color=(0, 0, 0))
然而,我已经没有想法了。任何帮助都将不胜感激,谢谢!:)