|
@@ -0,0 +1,274 @@
|
|
|
|
+package com.skyversation.poiaddr.util;
|
|
|
|
+
|
|
|
|
+import com.skyversation.poiaddr.entity.AddrBean;
|
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
|
+
|
|
|
|
+import javax.annotation.PostConstruct;
|
|
|
|
+import java.io.InputStream;
|
|
|
|
+import java.util.*;
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
+
|
|
|
|
+/**
|
|
|
|
+ * # 生成完整的上海市县乡记录.xlsx
|
|
|
|
+ * * 1、读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】,同时得到区和区代码的对应关系
|
|
|
|
+ * * 2、遍历xlsx文件列表,然后解析returnAddress,进行分词
|
|
|
|
+ * * 3、主要是获取街镇和居委的对应关系【判断到街镇后,得到下标,然后判断后面是否存在居委会或村委会关键字】
|
|
|
|
+ * 当前版本:V2.0.1
|
|
|
|
+ */
|
|
|
|
+@Service
|
|
|
|
+public class AddrSplitLmrMap {
|
|
|
|
+ // 上海市村居边界geojson文件地址
|
|
|
|
+ private static String All_no_SHFilePath = "全国省市县记录.xlsx";
|
|
|
|
+ private static String outPutFilePath = "geojson/上海市_村居边界.xlsx";
|
|
|
|
+
|
|
|
|
+ // 《区—街镇-居委》的对应关系
|
|
|
|
+ private static HashMap<String, HashMap<String, Set<String>>> D_S_C_tree = new HashMap<>();
|
|
|
|
+ // 非上海的《省-市-区》的对应关系
|
|
|
|
+ private static HashMap<String, HashMap<String, Set<String>>> All_NO_SH_tree = new HashMap<>();
|
|
|
|
+ // 区和区代码的对应关系
|
|
|
|
+ private static HashMap<String, String> districtCodeMap = new HashMap<>();
|
|
|
|
+
|
|
|
|
+ @PostConstruct
|
|
|
|
+ private void initFile() {
|
|
|
|
+ System.out.println("开始初始化分词器");
|
|
|
|
+ InputStream is = ShanghaiAddressSplitUtil.class.getResourceAsStream(outPutFilePath);
|
|
|
|
+ if (is == null) is = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + outPutFilePath);
|
|
|
|
+ if (is == null) throw new RuntimeException("无法找到" + outPutFilePath);
|
|
|
|
+ try {
|
|
|
|
+ for (Map<String, Object> row : ExcelReaderUtils.readExcel(is)) {
|
|
|
|
+ districtCodeMap.put(row.get("区").toString(), row.get("区代码").toString());
|
|
|
|
+ if (D_S_C_tree.containsKey(row.get("区").toString())) {
|
|
|
|
+ Map<String, Set<String>> SCT = D_S_C_tree.get(row.get("区").toString());
|
|
|
|
+ if (SCT.containsKey(row.get("镇").toString())) {
|
|
|
|
+ SCT.get(row.get("镇").toString()).add(row.get("居委").toString());
|
|
|
|
+ } else {
|
|
|
|
+ Set<String> CL = new HashSet<>();
|
|
|
|
+ CL.add(row.get("居委").toString());
|
|
|
|
+ SCT.put(row.get("镇").toString(), CL);
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ HashMap<String, Set<String>> SCT = new HashMap<>();
|
|
|
|
+ Set<String> CL = new HashSet<>();
|
|
|
|
+ CL.add(row.get("居委").toString());
|
|
|
|
+ SCT.put(row.get("镇").toString(), CL);
|
|
|
|
+ D_S_C_tree.put(row.get("区").toString(), SCT);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+ InputStream is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream(All_no_SHFilePath);
|
|
|
|
+ if (is2 == null) is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + All_no_SHFilePath);
|
|
|
|
+ if (is2 == null) throw new RuntimeException("无法找到" + All_no_SHFilePath);
|
|
|
|
+ try {
|
|
|
|
+ for (Map<String, Object> row : ExcelReaderUtils.readExcel(is2)) {
|
|
|
|
+ String ss = row.get("省份").toString();
|
|
|
|
+ String djs = row.get("地级市") != null && !row.get("地级市").toString().isEmpty() ? row.get("地级市").toString() : null;
|
|
|
|
+ String xjs = row.get("县级市") != null && !row.get("县级市").toString().isEmpty() ? row.get("县级市").toString() : null;
|
|
|
|
+ if (All_NO_SH_tree.containsKey(ss)) {
|
|
|
|
+ Map<String, Set<String>> SCT = All_NO_SH_tree.get(ss);
|
|
|
|
+ if (djs != null && xjs != null) {
|
|
|
|
+ if (SCT.containsKey(djs)) {
|
|
|
|
+ SCT.get(djs).add(xjs);
|
|
|
|
+ } else {
|
|
|
|
+ Set<String> CL = new HashSet<>();
|
|
|
|
+ CL.add(xjs);
|
|
|
|
+ SCT.put(djs, CL);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ HashMap<String, Set<String>> SCT = new HashMap<>();
|
|
|
|
+ if (djs != null && xjs != null) {
|
|
|
|
+ Set<String> CL = new HashSet<>();
|
|
|
|
+ CL.add(xjs);
|
|
|
|
+ SCT.put(djs, CL);
|
|
|
|
+ }
|
|
|
|
+ All_NO_SH_tree.put(ss, SCT);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * ## 分词:返回实体类【原地址:省:市:区:镇:居委:路牌地址:标准地址:区代码】
|
|
|
|
+ * * 1、初始化分词模型(读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】)(读取全国省市县记录.xlsx\得到对应关系【省:市:区县】)
|
|
|
|
+ * * 2、判断地址字符串是否存在[省、市、区、镇、街道、县]
|
|
|
|
+ * * 3、如果存在【省、市】判断是否是非上海市,是的话返回rule:0
|
|
|
|
+ * * 4、不知道是不是上海市的话,判断【区、镇】(先全词匹配,匹配不到的话模糊匹配)
|
|
|
|
+ * * 5、将匹配到的“区代码”拼接上310,否则直接是310000,作为搜索条件
|
|
|
|
+ * * 6、得到返回结果列表
|
|
|
|
+ * <p>
|
|
|
|
+ * // 省1:市2:区4:镇8:居委16
|
|
|
|
+ *
|
|
|
|
+ * @param addr
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ static Pattern pattern = Pattern.compile("市|区|镇|街道|县");
|
|
|
|
+ static Pattern spattern = Pattern.compile("路|街|道|村");
|
|
|
|
+
|
|
|
|
+ public static AddrBean outAddrMapInAddr(String addr) {
|
|
|
|
+ AddrBean addrMap = new AddrBean();
|
|
|
|
+ addrMap.setOldAddress(addr + "");
|
|
|
|
+ addrMap.setAddress(addr.replaceAll(" ", ""));
|
|
|
|
+ if (addr.contains("http")) {
|
|
|
|
+ // 先判断是否是链接
|
|
|
|
+ addrMap.setRule("-1");
|
|
|
|
+ } else {
|
|
|
|
+// 判断外地省名
|
|
|
|
+ boolean errorAddr = false;
|
|
|
|
+ for (String s : All_NO_SH_tree.keySet()) {
|
|
|
|
+ if (addr.startsWith(s) || (addr.startsWith(s.substring(0, 2)) && !ifTrueAddr(addr, s.substring(0, 2)))) {
|
|
|
|
+ addrMap.setProvinces(s);
|
|
|
|
+ addrMap.setRule("0");
|
|
|
|
+ if (!s.contains("上海")) {
|
|
|
|
+ errorAddr = true;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+// 判断外地市名
|
|
|
|
+ for (String m : All_NO_SH_tree.get(s).keySet()) {
|
|
|
|
+ if (addr.contains(m) && !addr.contains(m + "场")) {
|
|
|
|
+ addrMap.setProvinces(s);
|
|
|
|
+ addrMap.setMarket(m);
|
|
|
|
+ addrMap.setRule("-2");
|
|
|
|
+ errorAddr = true;
|
|
|
|
+ }
|
|
|
|
+ // 判断外地县名
|
|
|
|
+ for (String x : All_NO_SH_tree.get(s).get(m)) {
|
|
|
|
+ if (addr.contains(x) && !addr.contains(x + "场")) {
|
|
|
|
+ addrMap.setDistinguish(x);
|
|
|
|
+ addrMap.setProvinces(s);
|
|
|
|
+ addrMap.setMarket(m);
|
|
|
|
+ addrMap.setRule("-4");
|
|
|
|
+ errorAddr = true;
|
|
|
|
+ break;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// 如果不是外地数据和连接数据的话
|
|
|
|
+ if (!errorAddr) {
|
|
|
|
+// 上海地址匹配
|
|
|
|
+ if (pattern.matcher(addr).find()) {
|
|
|
|
+ if (addr.startsWith("上海")) {
|
|
|
|
+ addrMap.setProvinces("上海市");
|
|
|
|
+ addrMap.setMarket("上海市");
|
|
|
|
+ addrMap.setRule("2");
|
|
|
|
+ }
|
|
|
|
+// 匹配区
|
|
|
|
+ boolean ifContains = false;
|
|
|
|
+// 区匹配标识
|
|
|
|
+ String sh_distinguish = "";
|
|
|
|
+ for (String d : D_S_C_tree.keySet()) {
|
|
|
|
+ if (addr.contains(d) || addr.contains(d.substring(0, 2) + "县")) {
|
|
|
|
+ ifContains = true;
|
|
|
|
+ addrMap.setProvinces("上海市");
|
|
|
|
+ addrMap.setMarket("上海市");
|
|
|
|
+ addrMap.setDistinguish(d);
|
|
|
|
+ sh_distinguish = d;
|
|
|
|
+ addrMap.setRule("4");
|
|
|
|
+ break;
|
|
|
|
+ }
|
|
|
|
+ if (addr.contains(d.substring(0, 2)) && ifTrueAddr(addr, d.substring(0, 2))) {
|
|
|
|
+ addrMap.setProvinces("上海市");
|
|
|
|
+ addrMap.setMarket("上海市");
|
|
|
|
+ addrMap.setDistinguish(d);
|
|
|
|
+ sh_distinguish = d;
|
|
|
|
+ addrMap.setRule("4");
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+// 镇匹配
|
|
|
|
+ for (String d : D_S_C_tree.keySet()) {
|
|
|
|
+ for (String s : D_S_C_tree.get(d).keySet()) {
|
|
|
|
+ if (addr.contains(s)) {
|
|
|
|
+ addrMap.setProvinces("上海市");
|
|
|
|
+ addrMap.setMarket("上海市");
|
|
|
|
+ addrMap.setDistinguish(d);
|
|
|
|
+ addrMap.setStreetTown(s);
|
|
|
|
+ addrMap.setRule("8");
|
|
|
|
+ break;
|
|
|
|
+ }
|
|
|
|
+ if (addr.contains(s.substring(0, 2)) && ifContains && !sh_distinguish.isEmpty() && sh_distinguish.contains(d)) {
|
|
|
|
+ addrMap.setProvinces("上海市");
|
|
|
|
+ addrMap.setMarket("上海市");
|
|
|
|
+ addrMap.setDistinguish(d);
|
|
|
|
+ addrMap.setStreetTown(s);
|
|
|
|
+ addrMap.setRule("8");
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+// 特殊处理逻辑
|
|
|
|
+ if (addrMap.getDistinguish() != null && addrMap.getAddress() != null && addrMap.getDistinguish().contains("松江区") && addrMap.getAddress().contains("工业区")) {
|
|
|
|
+ addrMap.setStreetTown("松江技术开发区");
|
|
|
|
+ if (addrMap.getAddress().split("工业区").length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split("工业区")[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+// 输出路牌
|
|
|
|
+ if (addrMap.getProvinces() != null && !addrMap.getProvinces().isEmpty() && addrMap.getAddress().contains(addrMap.getProvinces())) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getProvinces()).length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getProvinces())[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (addrMap.getMarket() != null && !addrMap.getMarket().isEmpty() && addrMap.getAddress().contains(addrMap.getMarket())) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getMarket()).length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getMarket())[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (addrMap.getDistinguish() != null && !addrMap.getDistinguish().isEmpty()) {
|
|
|
|
+ if (addrMap.getAddress().contains(addrMap.getDistinguish())) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getDistinguish()).length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish())[1]);
|
|
|
|
+ }
|
|
|
|
+ } else if (addrMap.getAddress().contains(addrMap.getDistinguish().substring(0, 2) + "县")) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县").length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县")[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (addrMap.getStreetTown() != null && !addrMap.getStreetTown().isEmpty() && addrMap.getAddress().contains(addrMap.getStreetTown())) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getStreetTown()).length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getStreetTown())[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (addrMap.getResidentialCommittee() != null && !addrMap.getResidentialCommittee().isEmpty() && addrMap.getAddress().contains(addrMap.getResidentialCommittee())) {
|
|
|
|
+ if (addrMap.getAddress().split(addrMap.getResidentialCommittee()).length > 1) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().split(addrMap.getResidentialCommittee())[1]);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (addrMap.getAddress().contains("委会") || addrMap.getAddress().contains("员会")) {
|
|
|
|
+ addrMap.setAddress(addrMap.getAddress().substring(Math.max(addrMap.getAddress().indexOf("委会"), addrMap.getAddress().indexOf("员会")) + 2));
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return addrMap;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 判断是否是名称+路名|街名的格式
|
|
|
|
+ *
|
|
|
|
+ * @param addr
|
|
|
|
+ * @param tagStr
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public static boolean ifTrueAddr(String addr, String tagStr) {
|
|
|
|
+ String endStr = addr.substring(addr.indexOf(tagStr) + tagStr.length(), Math.min(addr.length(), addr.indexOf(tagStr) + tagStr.length() + 2));
|
|
|
|
+ if (spattern.matcher(endStr).find()) {
|
|
|
|
+ return true;
|
|
|
|
+ }
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public static void main(String[] args) {
|
|
|
|
+ AddrSplitLmrMap AddrSplitLmrMap = new AddrSplitLmrMap();
|
|
|
|
+ AddrSplitLmrMap.initFile();
|
|
|
|
+ System.out.println(outAddrMapInAddr("村165号"));
|
|
|
|
+ System.out.println(outAddrMapInAddr("河南驻马店汝南县东官庄镇"));
|
|
|
|
+ System.out.println(outAddrMapInAddr("云南省昭通市昭阳区永丰镇绿荫社区居民委员会管湾村二十五组205号"));
|
|
|
|
+ }
|
|
|
|
+}
|