| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- import datetime
- import re
- from string import punctuation
# Labelled fields in the order they are resolved.  The order matters: when a
# label is missing, the positional fallback continues from the row consumed
# by the previous fallback.
# NOTE(review): the last two patterns anchor on word *endings* ("ние",
# "ость") behind a word boundary -- presumably to catch labels that the
# upstream text extraction split in two; confirm against real input.
_LABELED_FIELDS = (
    ("last_name", r'\bфамилия[:\s]+([А-ЯЁа-яё]+)'),
    ("first_name", r'\bимя[:\s]+([А-ЯЁа-яё]+)'),
    ("patronymic", r'\bотчество[:\s]+([А-ЯЁа-яё]+)'),
    ("education", r'\bние[:\s]+([А-ЯЁа-яё]+)'),
    ("profession", r'\bость[:\s]+([А-ЯЁа-яё]+)'),
)

# Russian month names (genitive and nominative) -> month number, so that
# textual dates parse regardless of the process locale.
_RU_MONTHS = {
    "января": 1, "январь": 1,
    "февраля": 2, "февраль": 2,
    "марта": 3, "март": 3,
    "апреля": 4, "апрель": 4,
    "мая": 5, "май": 5,
    "июня": 6, "июнь": 6,
    "июля": 7, "июль": 7,
    "августа": 8, "август": 8,
    "сентября": 9, "сентябрь": 9,
    "октября": 10, "октябрь": 10,
    "ноября": 11, "ноябрь": 11,
    "декабря": 12, "декабрь": 12,
}


def _pick_token_after(text_rows, anchor_index):
    """Guess a field value from the three rows following *anchor_index*.

    Rows with fewer than two whitespace-separated tokens are skipped.  In the
    first remaining row that has a token longer than three characters, the
    chosen token is the long token whose first occurrence is right-most.

    Returns ``(token, row_index)`` or ``None`` when nothing suitable exists.
    """
    for offset in range(1, 4):
        row_index = anchor_index + offset
        if row_index >= len(text_rows):
            # Ran out of rows: stop instead of raising IndexError.
            break
        tokens = text_rows[row_index].split()
        if len(tokens) < 2:
            continue
        # dict.fromkeys keeps first-occurrence order and drops repeats, which
        # yields the same "last" token as sorting by first index would.
        candidates = [tok for tok in dict.fromkeys(tokens) if len(tok) > 3]
        if candidates:
            return candidates[-1], row_index
    return None


def _parse_birthday(raw):
    """Convert a captured date to ``YYYY-MM-DD``; return ``""`` if invalid.

    Accepts ``dd.mm.yyyy`` (whitespace around the dots is tolerated) and
    ``d <month name> yyyy``.
    """
    try:
        if '.' in raw:
            compact = re.sub(r'\s+', '', raw)
            parsed = datetime.datetime.strptime(compact, '%d.%m.%Y')
            return parsed.date().isoformat()
        day, month_word, year = raw.split()
        month = _RU_MONTHS.get(month_word.lower())
        if month is None:
            # Unknown month word: keep the locale-dependent parse as a
            # last resort so non-Russian locales behave as before.
            parsed = datetime.datetime.strptime(raw, '%d %B %Y')
            return parsed.date().isoformat()
        return datetime.date(int(year), month, int(day)).isoformat()
    except ValueError:
        # Best effort: an unparsable date leaves the field empty.
        return ""


def parser_text(text):
    """Extract document fields from recognised text.

    Each of last name, first name, patronymic, education and profession is
    first looked up by its label.  When the label is absent and the text has
    a row containing ``№``, the value is guessed positionally from the rows
    that follow (see ``_pick_token_after``); each successful guess moves the
    starting row forward.  Without a ``№`` row no positional guess is made.

    Args:
        text: The full text, rows separated by ``'\\n'``.

    Returns:
        A tuple ``(data, all_word)``.  ``data`` is ``{"title": {...}}`` with
        the keys ``serial``, ``number``, ``first_name``, ``last_name``,
        ``patronymic``, ``birthday``, ``issue_date``, ``profession`` and
        ``education``; values that were not found are empty strings, and
        ``birthday`` is ``YYYY-MM-DD`` when it could be parsed.  ``serial``
        and ``issue_date`` are never filled in by this function.  All values
        except the two dates are passed through
        ``replace_punctuation_and_lower``.  ``all_word`` lists every
        whitespace-separated token of the text longer than three characters.
    """
    title = {
        "serial": "",
        "number": "",
        "first_name": "",
        "last_name": "",
        "patronymic": "",
        "birthday": "",
        "issue_date": "",
        "profession": "",
        "education": "",
    }
    data = {"title": title}

    number_match = re.search(r'№\s*(\d+)', text)
    if number_match:
        title["number"] = number_match.group(1)

    text_rows = text.split('\n')
    # Index of the first row mentioning "№"; None disables the positional
    # fallback.  A dedicated None sentinel (rather than truthiness) keeps the
    # fallback working when that row is row 0.
    # NOTE(review): the original assigned a default start row of 3 for the
    # no-"№" case but never used it; confirm whether a fallback was intended.
    anchor = next(
        (index for index, row in enumerate(text_rows) if '№' in row),
        None,
    )

    for field, pattern in _LABELED_FIELDS:
        labeled = re.search(pattern, text, re.IGNORECASE)
        if labeled:
            title[field] = labeled.group(1)
            continue
        if anchor is None:
            continue
        guess = _pick_token_after(text_rows, anchor)
        if guess is not None:
            # The next positional guess starts below the row just consumed.
            title[field], anchor = guess

    birthday_match = re.search(
        r'\bния[:\s]+([0-3]?[0-9] [а-яё]+ \d{4})', text, re.IGNORECASE)
    if birthday_match is None:
        birthday_match = re.search(
            r'(\d{2}\s*\.\s*\d{2}\s*\.\s*\d{4})', text, re.IGNORECASE)
    if birthday_match:
        title["birthday"] = _parse_birthday(birthday_match.group(1))

    for key in title:
        if key not in ('birthday', 'issue_date'):
            title[key] = replace_punctuation_and_lower(title[key])

    all_word = [
        word
        for row in text_rows
        for word in row.split()
        if len(word) > 3
    ]
    return data, all_word
# Deletes every ASCII punctuation mark plus the backtick and the left single
# quotation mark; built once so each call is a single translate pass.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation + '`‘')


def replace_punctuation_and_lower(text):
    """Strip punctuation from *text* and normalise its letter case.

    Every character of ``string.punctuation``, the backtick and the left
    single quotation mark is removed.  Despite the function name, the result
    is not fully lower-case: it is lower-cased and then capitalised, so the
    first character is upper-case and the rest lower-case.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, capitalised string (``""`` for empty input).
    """
    return text.translate(_PUNCTUATION_TABLE).lower().capitalize()
|