AddrSplitLmrMap.java 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. package com.skyversation.poiaddr.util;
  2. import com.skyversation.poiaddr.entity.AddrBean;
  3. import org.springframework.stereotype.Service;
  4. import javax.annotation.PostConstruct;
  5. import java.io.InputStream;
  6. import java.util.*;
  7. import java.util.regex.Matcher;
  8. import java.util.regex.Pattern;
  9. /**
  10. * # 生成完整的上海市县乡记录.xlsx
  11. * * 1、读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】,同时得到区和区代码的对应关系
  12. * * 2、遍历xlsx文件列表,然后解析returnAddress,进行分词
  13. * * 3、主要是获取街镇和居委的对应关系【判断到街镇后,得到下标,然后判断后面是否存在居委会或村委会关键字】
  14. * 当前版本:V2.0.1
  15. */
  16. @Service
  17. public class AddrSplitLmrMap {
  18. // 上海市村居边界geojson文件地址
  19. private static String All_no_SHFilePath = "全国省市县记录.xlsx";
  20. private static String outPutFilePath = "geojson/上海市_村居边界.xlsx";
  21. // 《区—街镇-居委》的对应关系
  22. private static HashMap<String, HashMap<String, Set<String>>> D_S_C_tree = new HashMap<>();
  23. // 非上海的《省-市-区》的对应关系
  24. private static HashMap<String, HashMap<String, Set<String>>> All_NO_SH_tree = new HashMap<>();
  25. // 区和区代码的对应关系
  26. private static HashMap<String, String> districtCodeMap = new HashMap<>();
  27. @PostConstruct
  28. private void initFile() {
  29. System.out.println("开始初始化分词器");
  30. InputStream is = ShanghaiAddressSplitUtil.class.getResourceAsStream(outPutFilePath);
  31. if (is == null) is = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + outPutFilePath);
  32. if (is == null) throw new RuntimeException("无法找到" + outPutFilePath);
  33. try {
  34. for (Map<String, Object> row : ExcelReaderUtils.readExcel(is)) {
  35. districtCodeMap.put(row.get("区").toString(), row.get("区代码").toString());
  36. if (D_S_C_tree.containsKey(row.get("区").toString())) {
  37. Map<String, Set<String>> SCT = D_S_C_tree.get(row.get("区").toString());
  38. if (SCT.containsKey(row.get("镇").toString())) {
  39. SCT.get(row.get("镇").toString()).add(row.get("居委").toString());
  40. } else {
  41. Set<String> CL = new HashSet<>();
  42. CL.add(row.get("居委").toString());
  43. SCT.put(row.get("镇").toString(), CL);
  44. }
  45. } else {
  46. HashMap<String, Set<String>> SCT = new HashMap<>();
  47. Set<String> CL = new HashSet<>();
  48. CL.add(row.get("居委").toString());
  49. SCT.put(row.get("镇").toString(), CL);
  50. D_S_C_tree.put(row.get("区").toString(), SCT);
  51. }
  52. }
  53. } catch (Exception e) {
  54. e.printStackTrace();
  55. }
  56. InputStream is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream(All_no_SHFilePath);
  57. if (is2 == null) is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + All_no_SHFilePath);
  58. if (is2 == null) throw new RuntimeException("无法找到" + All_no_SHFilePath);
  59. try {
  60. for (Map<String, Object> row : ExcelReaderUtils.readExcel(is2)) {
  61. String ss = row.get("省份").toString();
  62. String djs = row.get("地级市") != null && !row.get("地级市").toString().isEmpty() ? row.get("地级市").toString() : null;
  63. String xjs = row.get("县级市") != null && !row.get("县级市").toString().isEmpty() ? row.get("县级市").toString() : null;
  64. if (All_NO_SH_tree.containsKey(ss)) {
  65. Map<String, Set<String>> SCT = All_NO_SH_tree.get(ss);
  66. if (djs != null && xjs != null) {
  67. if (SCT.containsKey(djs)) {
  68. SCT.get(djs).add(xjs);
  69. } else {
  70. Set<String> CL = new HashSet<>();
  71. CL.add(xjs);
  72. SCT.put(djs, CL);
  73. }
  74. }
  75. } else {
  76. HashMap<String, Set<String>> SCT = new HashMap<>();
  77. if (djs != null && xjs != null) {
  78. Set<String> CL = new HashSet<>();
  79. CL.add(xjs);
  80. SCT.put(djs, CL);
  81. }
  82. All_NO_SH_tree.put(ss, SCT);
  83. }
  84. }
  85. } catch (Exception e) {
  86. e.printStackTrace();
  87. }
  88. }
  89. /**
  90. * ## 分词:返回实体类【原地址:省:市:区:镇:居委:路牌地址:标准地址:区代码】
  91. * * 1、初始化分词模型(读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】)(读取全国省市县记录.xlsx\得到对应关系【省:市:区县】)
  92. * * 2、判断地址字符串是否存在[省、市、区、镇、街道、县]
  93. * * 3、如果存在【省、市】判断是否是非上海市,是的话返回rule:0
  94. * * 4、不知道是不是上海市的话,判断【区、镇】(先全词匹配,匹配不到的话模糊匹配)
  95. * * 5、将匹配到的“区代码”拼接上310,否则直接是310000,作为搜索条件
  96. * * 6、得到返回结果列表
  97. * <p>
  98. * // 省1:市2:区4:镇8:居委16
  99. *
  100. * @param addr
  101. * @return
  102. */
  103. static Pattern pattern = Pattern.compile("市|区|镇|街道|县");
  104. static Pattern spattern = Pattern.compile("路|街|道|村");
  105. public static AddrBean outAddrMapInAddr(String addr) {
  106. AddrBean addrMap = new AddrBean();
  107. addrMap.setOldAddress(addr + "");
  108. addrMap.setAddress(addr.replaceAll(" ", ""));
  109. if (addr.contains("http")) {
  110. // 先判断是否是链接
  111. addrMap.setRule("-1");
  112. } else {
  113. // 判断外地省名
  114. boolean errorAddr = false;
  115. for (String s : All_NO_SH_tree.keySet()) {
  116. if (addr.startsWith(s) || (addr.startsWith(s.substring(0, 2)) && !ifTrueAddr(addr, s.substring(0, 2)))) {
  117. addrMap.setProvinces(s);
  118. addrMap.setRule("0");
  119. if (!s.contains("上海")) {
  120. errorAddr = true;
  121. }
  122. }
  123. // 判断外地市名
  124. for (String m : All_NO_SH_tree.get(s).keySet()) {
  125. if (addr.contains(m) && !addr.contains(m + "场")) {
  126. addrMap.setProvinces(s);
  127. addrMap.setMarket(m);
  128. addrMap.setRule("-2");
  129. }
  130. // 判断外地县名
  131. for (String x : All_NO_SH_tree.get(s).get(m)) {
  132. if (addr.contains(x) && !addr.contains(x + "场")) {
  133. addrMap.setDistinguish(x);
  134. addrMap.setProvinces(s);
  135. addrMap.setMarket(m);
  136. addrMap.setRule("-4");
  137. break;
  138. }
  139. }
  140. }
  141. }
  142. // 如果不是外地数据和连接数据的话
  143. if (!errorAddr) {
  144. // 上海地址匹配
  145. if (pattern.matcher(addr).find()) {
  146. if (addr.startsWith("上海")) {
  147. addrMap.setProvinces("上海市");
  148. addrMap.setMarket("上海市");
  149. addrMap.setRule("2");
  150. }
  151. // 匹配区
  152. boolean ifContains = false;
  153. // 区匹配标识
  154. String sh_distinguish = "";
  155. for (String d : D_S_C_tree.keySet()) {
  156. if (addr.contains(d) || addr.contains(d.substring(0, 2) + "县")) {
  157. ifContains = true;
  158. addrMap.setProvinces("上海市");
  159. addrMap.setMarket("上海市");
  160. addrMap.setDistinguish(d);
  161. sh_distinguish = d;
  162. addrMap.setRule("4");
  163. break;
  164. }
  165. if (addr.contains(d.substring(0, 2)) && ifTrueAddr(addr, d.substring(0, 2))) {
  166. addrMap.setProvinces("上海市");
  167. addrMap.setMarket("上海市");
  168. addrMap.setDistinguish(d);
  169. sh_distinguish = d;
  170. addrMap.setRule("4");
  171. }
  172. }
  173. // 镇匹配
  174. for (String d : D_S_C_tree.keySet()) {
  175. for (String s : D_S_C_tree.get(d).keySet()) {
  176. if (addr.contains(s)) {
  177. addrMap.setProvinces("上海市");
  178. addrMap.setMarket("上海市");
  179. addrMap.setDistinguish(d);
  180. addrMap.setStreetTown(s);
  181. addrMap.setRule("8");
  182. break;
  183. }
  184. if (addr.contains(s.substring(0, 2)) && ifContains && !sh_distinguish.isEmpty() && sh_distinguish.contains(d)) {
  185. addrMap.setProvinces("上海市");
  186. addrMap.setMarket("上海市");
  187. addrMap.setDistinguish(d);
  188. addrMap.setStreetTown(s);
  189. addrMap.setRule("8");
  190. }
  191. }
  192. }
  193. }
  194. }
  195. // 特殊处理逻辑
  196. if (addrMap.getDistinguish() != null && addrMap.getAddress() != null && addrMap.getDistinguish().contains("松江区") && addrMap.getAddress().contains("工业区")) {
  197. addrMap.setStreetTown("松江技术开发区");
  198. if (addrMap.getAddress().split("工业区").length > 1) {
  199. addrMap.setAddress(addrMap.getAddress().split("工业区")[1]);
  200. }
  201. }
  202. // 输出路牌
  203. if (addrMap.getProvinces() != null && !addrMap.getProvinces().isEmpty() && addrMap.getAddress().contains(addrMap.getProvinces())) {
  204. if (addrMap.getAddress().split(addrMap.getProvinces()).length > 1) {
  205. addrMap.setAddress(addrMap.getAddress().split(addrMap.getProvinces())[1]);
  206. }
  207. }
  208. if (addrMap.getMarket() != null && !addrMap.getMarket().isEmpty() && addrMap.getAddress().contains(addrMap.getMarket())) {
  209. if (addrMap.getAddress().split(addrMap.getMarket()).length > 1) {
  210. addrMap.setAddress(addrMap.getAddress().split(addrMap.getMarket())[1]);
  211. }
  212. }
  213. if (addrMap.getDistinguish() != null && !addrMap.getDistinguish().isEmpty()) {
  214. if (addrMap.getAddress().contains(addrMap.getDistinguish())) {
  215. if (addrMap.getAddress().split(addrMap.getDistinguish()).length > 1) {
  216. addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish())[1]);
  217. }
  218. } else if (addrMap.getAddress().contains(addrMap.getDistinguish().substring(0, 2) + "县")) {
  219. if (addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县").length > 1) {
  220. addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县")[1]);
  221. }
  222. }
  223. }
  224. if (addrMap.getStreetTown() != null && !addrMap.getStreetTown().isEmpty() && addrMap.getAddress().contains(addrMap.getStreetTown())) {
  225. if (addrMap.getAddress().split(addrMap.getStreetTown()).length > 1) {
  226. addrMap.setAddress(addrMap.getAddress().split(addrMap.getStreetTown())[1]);
  227. }
  228. }
  229. if (addrMap.getResidentialCommittee() != null && !addrMap.getResidentialCommittee().isEmpty() && addrMap.getAddress().contains(addrMap.getResidentialCommittee())) {
  230. if (addrMap.getAddress().split(addrMap.getResidentialCommittee()).length > 1) {
  231. addrMap.setAddress(addrMap.getAddress().split(addrMap.getResidentialCommittee())[1]);
  232. }
  233. }
  234. if (addrMap.getAddress().contains("委会") || addrMap.getAddress().contains("员会")) {
  235. addrMap.setAddress(addrMap.getAddress().substring(Math.max(addrMap.getAddress().indexOf("委会"), addrMap.getAddress().indexOf("员会")) + 2));
  236. }
  237. }
  238. return addrMap;
  239. }
  240. /**
  241. * 判断是否是名称+路名|街名的格式
  242. *
  243. * @param addr
  244. * @param tagStr
  245. * @return
  246. */
  247. public static boolean ifTrueAddr(String addr, String tagStr) {
  248. String endStr = addr.substring(addr.indexOf(tagStr) + tagStr.length(), Math.min(addr.length(), addr.indexOf(tagStr) + tagStr.length() + 2));
  249. if (spattern.matcher(endStr).find()) {
  250. return true;
  251. }
  252. return false;
  253. }
  254. /**
  255. * 解析中文地址为结构化列表
  256. *
  257. * @param address 原始地址字符串
  258. * @return 分词后的列表,按路、弄、号楼、层、室的顺序
  259. */
  260. public static List<String> parseAddress(String address) {
  261. List<String> result = new ArrayList<>(Arrays.asList(null, null, null, null, null));
  262. if (address == null || address.trim().isEmpty()) {
  263. return result;
  264. }
  265. address = address.trim();
  266. // 1. 提取路(简化版:直接匹配"弄"之前的所有字符)
  267. String roadPattern = "(.+?)[弄]";
  268. Matcher roadMatcher = Pattern.compile(roadPattern).matcher(address);
  269. if (roadMatcher.find() && roadMatcher.start() == 0) {
  270. result.set(0, roadMatcher.group(1));
  271. address = address.substring(roadMatcher.end() - 1); // 从"弄"之后开始截取
  272. } else {
  273. // 如果没有找到"弄",则尝试使用原来的路名匹配逻辑
  274. roadPattern = "(.+?[路街道路巷弄])";
  275. roadMatcher = Pattern.compile(roadPattern).matcher(address);
  276. if (roadMatcher.find() && roadMatcher.start() == 0) {
  277. result.set(0, roadMatcher.group(1));
  278. address = address.substring(roadMatcher.end());
  279. }
  280. }
  281. // 2. 提取弄(支持连续的弄和支弄)
  282. String lanePattern = "([0-9一二三四五六七八九十百千]+[弄支弄]+)";
  283. Matcher laneMatcher = Pattern.compile(lanePattern).matcher(address);
  284. StringBuilder laneBuilder = new StringBuilder();
  285. while (laneMatcher.find() && laneMatcher.start() == 0) {
  286. laneBuilder.append(laneMatcher.group(1));
  287. address = address.substring(laneMatcher.end());
  288. laneMatcher = Pattern.compile(lanePattern).matcher(address);
  289. }
  290. if (laneBuilder.length() > 0) {
  291. result.set(1, laneBuilder.toString());
  292. }
  293. // 3. 提取号楼(支持XX号格式)
  294. String buildingPattern = "([0-9一二三四五六七八九十百千]+[号楼栋号])";
  295. Matcher buildingMatcher = Pattern.compile(buildingPattern).matcher(address);
  296. if (buildingMatcher.find() && buildingMatcher.start() == 0) {
  297. result.set(2, buildingMatcher.group(1));
  298. address = address.substring(buildingMatcher.end());
  299. }
  300. // 智能楼层室号解析
  301. String roomPattern = "([0-9]{1,2})([0-9]{2,})[室房]"; // 修改正则表达式,确保室号部分至少两位数
  302. Matcher roomMatcher = Pattern.compile(roomPattern).matcher(address);
  303. if (roomMatcher.find()) {
  304. String floorPart = roomMatcher.group(1);
  305. String roomPart = roomMatcher.group(2);
  306. // 设置楼层
  307. result.set(3, floorPart + "层");
  308. // 设置室号(直接使用匹配到的部分,不去除前导零)
  309. result.set(4, floorPart + roomPart + "室"); // 修改此处,直接使用roomPart
  310. // 移除已匹配的部分
  311. address = address.substring(0, roomMatcher.start()) +
  312. address.substring(roomMatcher.end());
  313. } else {
  314. // 4. 提取层
  315. String floorPattern = "([0-9一二三四五六七八九十百千]+[层楼])";
  316. Matcher floorMatcher = Pattern.compile(floorPattern).matcher(address);
  317. if (floorMatcher.find()) {
  318. result.set(3, floorMatcher.group(1));
  319. address = address.substring(0, floorMatcher.start()) + address.substring(floorMatcher.end());
  320. }
  321. // 5. 提取室
  322. String roomPatternSimple = "([0-9]+[室房])";
  323. Matcher roomMatcherSimple = Pattern.compile(roomPatternSimple).matcher(address);
  324. if (roomMatcherSimple.find()) {
  325. result.set(4, roomMatcherSimple.group(1));
  326. }
  327. }
  328. return result;
  329. }
  330. public static void main(String[] args) {
  331. /*AddrSplitLmrMap AddrSplitLmrMap = new AddrSplitLmrMap();
  332. AddrSplitLmrMap.initFile();
  333. System.out.println(outAddrMapInAddr("村165号"));
  334. System.out.println(outAddrMapInAddr("上海市松江区乐都路"));
  335. System.out.println(outAddrMapInAddr("云南省昭通市昭阳区永丰镇绿荫社区居民委员会管湾村二十五组205号"));*/
  336. // 测试示例(包含所有典型场景)
  337. // 测试一位数楼层地址
  338. String address4 = "广富林1188弄167号313室";
  339. System.out.println("\n测试地址: " + address4);
  340. printParsedResult(parseAddress(address4));
  341. }
  342. private static void printParsedResult(List<String> parsed) {
  343. System.out.println("解析结果:");
  344. System.out.println("路: " + parsed.get(0));
  345. System.out.println("弄: " + parsed.get(1));
  346. System.out.println("号楼: " + parsed.get(2));
  347. System.out.println("层: " + parsed.get(3));
  348. System.out.println("室: " + parsed.get(4));
  349. }
  350. }