对于人工自由输入的文本,正则表达式可能会写得很难看,因为每个人的输入习惯各不相同。如果可能,请在存储数据之前先对数据进行清理。
例如,可以先将所有内容转换为小写,并把 `rm` 替换为 `room`。再删除字符串中的所有非字母数字字符。我肯定会在使用正则表达式之前先完成这些清理。
import re
def clean_data(text):
    """Normalize free-form text before searching it for room references.

    The input is lowercased, the abbreviation ``rm`` is expanded to ``room``,
    and every character that is not an ASCII letter, a digit, or a space is
    removed.

    Note that separators are removed as well, so ``"37/39"`` becomes
    ``"3739"``.

    Args:
        text: The raw string typed by a user.

    Returns:
        The cleaned, lowercase string.
    """
    text = text.lower()
    # Expand "rm" only when it starts a word and is not followed by another
    # letter. A plain str.replace would also rewrite the "rm" inside ordinary
    # words such as "form" or "alarm".
    text = re.sub(r'\brm(?![a-z])', 'room', text)
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)
# Pattern for a room reference, built from three capture groups:
#   1. the alias: "room" or "rm" as a whole word;
#   2. an optional qualifier: "#", "number", or "no" (with an optional literal
#      dot). Every alternative is non-empty and the group as a whole is
#      optional, so the keywords are actually tried instead of being shadowed
#      by an alternative that can match the empty string;
#   3. the room identifier: an optional single letter followed by whitespace
#      (as in "c 29"), then a token that must contain at least one digit, so
#      ordinary words following "room" are not reported as room numbers.
pattern = (
    r'\b(room|rm)\b\s*'
    r'(#|number\b|no\b\.?)?\s*'
    r'((?:[a-z]\s)?[a-z0-9/-]*\d[a-z0-9/-]*)\b'
)
text = ['room b7', 'rm 2', 'not a room fsd', 'rm no 4', 'room a12', 'Room 12', 'Roomd25', 'room D25', 'ROOM D25', 'ROOM C11', 'room 17', 'room A4B', 'room 101', 'rm 17', 'room 37/39', 'ROOM B1', 'room C 29 from', 'room C23/25/27', 'room 16b/18a/18b', 'and other sentences that should be left intact like Clean the room now AVOID.']
# Normalize every sample before matching.
ny_text = [clean_data(t) for t in text]
for t in ny_text:
    # Only the first room reference per string is reported, so a single
    # search is enough; there is no need to collect every match.
    match = re.search(pattern, t, re.IGNORECASE)
    if not match:
        continue
    room_alias = match.group(1)
    room_number = match.group(3)
    print(f"Found: {room_alias} {room_number}")