# parse_text.py (5.5 KB)
  1. import datetime
  2. import re
  3. from string import punctuation
  4. def parser_text(text):
  5. data = {
  6. "title": {
  7. "serial": "",
  8. "number": "",
  9. "first_name": "",
  10. "last_name": "",
  11. "patronymic": "",
  12. "birthday": "",
  13. "issue_date": "",
  14. "profession": "",
  15. "education": ""
  16. }
  17. }
  18. serial_match = re.search(r'№\s*(\d+)', text)
  19. if serial_match:
  20. data["title"]["number"] = serial_match.group(1)
  21. last_name_match = re.search(r'\bфамилия[:\s]+([А-ЯЁа-яё]+)', text, re.IGNORECASE)
  22. text_rows = text.split('\n')
  23. number_index = list(filter(lambda x: '№' in x, text_rows))
  24. if not number_index:
  25. next_index = 3
  26. else:
  27. next_index = text_rows.index(number_index[0])
  28. if last_name_match:
  29. data["title"]["last_name"] = last_name_match.group(1)
  30. else:
  31. if number_index:
  32. number_index = next_index
  33. for i in range(1, 4):
  34. row = text_rows[number_index + i].split()
  35. row_copy = row.copy()
  36. if len(row) < 2:
  37. continue
  38. row.sort(key=lambda x: (row_copy.index(x), len(x)))
  39. row = list(filter(lambda x: len(x) > 3, row))
  40. if row:
  41. if len(row[-1]) > 3:
  42. data["title"]["last_name"] = row[-1]
  43. next_index = number_index + i
  44. break
  45. first_name_match = re.search(r'\bимя[:\s]+([А-ЯЁа-яё]+)', text, re.IGNORECASE)
  46. if first_name_match:
  47. data["title"]["first_name"] = first_name_match.group(1)
  48. else:
  49. if number_index:
  50. number_index = next_index
  51. for i in range(1, 4):
  52. row = text_rows[number_index + i].split()
  53. row_copy = row.copy()
  54. if len(row) < 2:
  55. continue
  56. row.sort(key=lambda x: (row_copy.index(x), len(x)))
  57. row = list(filter(lambda x: len(x) > 3, row))
  58. if row:
  59. if len(row[-1]) > 3:
  60. data["title"]["first_name"] = row[-1]
  61. next_index = number_index + i
  62. break
  63. patronymic_match = re.search(r'\bотчество[:\s]+([А-ЯЁа-яё]+)', text, re.IGNORECASE)
  64. if patronymic_match:
  65. data["title"]["patronymic"] = patronymic_match.group(1)
  66. else:
  67. if number_index:
  68. number_index = next_index
  69. for i in range(1, 4):
  70. row = text_rows[number_index + i].split()
  71. row_copy = row.copy()
  72. if len(row) < 2:
  73. continue
  74. row.sort(key=lambda x: (row_copy.index(x), len(x)))
  75. row = list(filter(lambda x: len(x) > 3, row))
  76. if row:
  77. if len(row[-1]) > 3:
  78. data["title"]["patronymic"] = row[-1]
  79. next_index = number_index + i
  80. break
  81. patronymic_match = re.search(r'\bние[:\s]+([А-ЯЁа-яё]+)', text, re.IGNORECASE)
  82. if patronymic_match:
  83. data["title"]["education"] = patronymic_match.group(1)
  84. else:
  85. if number_index:
  86. number_index = next_index
  87. for i in range(1, 4):
  88. row = text_rows[number_index + i].split()
  89. row_copy = row.copy()
  90. if len(row) < 2:
  91. continue
  92. row.sort(key=lambda x: (row_copy.index(x), len(x)))
  93. row = list(filter(lambda x: len(x) > 3, row))
  94. if row:
  95. if len(row[-1]) > 3:
  96. data["title"]["education"] = row[-1]
  97. next_index = number_index + i
  98. break
  99. patronymic_match = re.search(r'\bость[:\s]+([А-ЯЁа-яё]+)', text, re.IGNORECASE)
  100. if patronymic_match:
  101. data["title"]["profession"] = patronymic_match.group(1)
  102. else:
  103. if number_index:
  104. number_index = next_index
  105. for i in range(1, 4):
  106. row = text_rows[number_index + i].split()
  107. row_copy = row.copy()
  108. if len(row) < 2:
  109. continue
  110. row.sort(key=lambda x: (row_copy.index(x), len(x)))
  111. row = list(filter(lambda x: len(x) > 3, row))
  112. if row:
  113. if len(row[-1]) > 3:
  114. data["title"]["profession"] = row[-1]
  115. next_index = number_index + i
  116. break
  117. birthday_match = re.search(r'\bния[:\s]+([0-3]?[0-9] [а-яё]+ \d{4})', text, re.IGNORECASE)
  118. if birthday_match is None:
  119. birthday_match = re.search(r'(\d{2}\s*\.\s*\d{2}\s*\.\s*\d{4})', text, re.IGNORECASE)
  120. if birthday_match:
  121. try:
  122. data["title"]["birthday"] = datetime.datetime.strptime(birthday_match.group(1), '%d %B %Y').strftime(
  123. '%Y-%m-%d')
  124. except ValueError:
  125. pass
  126. for key in data['title'].keys():
  127. if key not in ['birthday', 'issue_date']:
  128. data['title'][key] = replace_punctuation_and_lower(data['title'][key])
  129. all_word = []
  130. for word_row in [words.split() for words in text_rows]:
  131. for word in word_row:
  132. if len(word) > 3:
  133. all_word.append(word)
  134. return data, all_word
  135. def replace_punctuation_and_lower(text):
  136. for symbol in punctuation + '`‘':
  137. text = text.replace(symbol, '')
  138. return text.lower().capitalize()