FuzzyWuzzy 是一个简单易用的模糊字符串匹配工具包。它依据 Levenshtein Distance 算法,计算两个序列之间的差异。
Levenshtein Distance 算法,又叫 Edit Distance 算法,是指两个字符串之间,由一个转成另一个所需的最少编辑操作次数。许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。一般来说,编辑距离越小,两个串的相似度越大。
from sqlalchemy import create_engine,Table,Column,Date,Integer,String,ForeignKey
import os
import pymssql
import pymysql
import datetime
import time
import sys
import numpy
from text2vec import Similarity
from gensim import similarities
import paramiko
from smb.SMBConnection import *
import csv
import re
import sqlalchemy
import pandas as pd
from sqlalchemy import MetaData,create_engine,Table,Integer
import socket
import fuzzywuzzy
import pandarallel
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
# Initialise pandarallel. The file imports the package with
# ``import pandarallel``, so the initialiser has to be reached through the
# package; the bare name ``parallel`` is never defined.
pandarallel.pandarallel.initialize()

# Wall-clock start time; db_conn() prints the time elapsed since this point.
t1 = time.time()

# SQL Server connection settings, passed to db_conn() by the entry guard.
# NOTE(review): the values look like redacted placeholders and the
# credentials are hard-coded -- consider loading them from the environment
# or a secrets store instead.
db_host = '10.1*****'
db_user = '**admin'
db_passwd = '****'
db_port = '20001'
db_database = 'ODS_'
def fuzzy_match(x, choices, scorer, cutoff):
    """Return the entry of *choices* that best matches *x*.

    Args:
        x: Query string.
        choices: Candidate strings, forwarded to ``process.extractOne``.
        scorer: Scoring function, e.g. ``fuzz.ratio``.
        cutoff: Minimum acceptable score, forwarded as ``score_cutoff``.

    Returns:
        The best-matching choice, or ``None`` when ``extractOne`` reports no
        match.
    """
    best = process.extractOne(
        x, choices=choices, scorer=scorer, score_cutoff=cutoff
    )
    # extractOne yields None when nothing qualifies; subscripting that
    # result directly would raise TypeError.
    if best is None:
        return None
    return best[0]
def fuzzy_score(x, choices, scorer, cutoff):
    """Return the score of the entry of *choices* that best matches *x*.

    Args:
        x: Query string.
        choices: Candidate strings, forwarded to ``process.extractOne``.
        scorer: Scoring function, e.g. ``fuzz.ratio``.
        cutoff: Minimum acceptable score, forwarded as ``score_cutoff``.

    Returns:
        The score of the best match, or ``0`` when ``extractOne`` reports no
        match.
    """
    best = process.extractOne(
        x, choices=choices, scorer=scorer, score_cutoff=cutoff
    )
    # extractOne yields None when nothing qualifies; subscripting that
    # result directly would raise TypeError.
    if best is None:
        return 0
    return best[1]
def db_conn(db_host, db_user, db_passwd, db_database, db_port):
    """Fuzzy-match supplier names against the WorldCheck name list.

    Connects to SQL Server, loads ``ODS_Legal_Ariba_VendorData_New`` and
    ``ODS_Legal_WorldCheck_Name_Full``, and for every ``SupplierName`` looks
    up the closest ``NAME`` using ``fuzz.ratio``. The best candidate and its
    score are stored in the ``match`` and ``SCORES`` columns. The frame is
    then sorted by ascending score, printed, and written to
    ``./fuzzy_match2.csv``. Finally the seconds elapsed since the
    module-level ``t1`` are printed.

    Args:
        db_host: Server address passed to ``pymssql.connect``.
        db_user: Login name.
        db_passwd: Login password.
        db_database: Database to open.
        db_port: Server port.

    Returns:
        None.

    Raises:
        Exceptions raised by ``pymssql.connect`` or ``pandas.read_sql``
        propagate unchanged.
    """
    # pymssql.connect raises when the connection cannot be established, so
    # no separate cursor/truthiness check is needed afterwards.
    conn = pymssql.connect(
        server=db_host,
        user=db_user,
        password=db_passwd,
        database=db_database,
        port=db_port,
    )
    try:
        print('succeed')
        df1 = pd.read_sql(
            "select * from ODS_Legal_Ariba_VendorData_New", con=conn
        )
        df2 = pd.read_sql(
            "select * from ODS_Legal_WorldCheck_Name_Full", con=conn
        )
    finally:
        # Both tables are in memory now; release the connection even if a
        # query failed.
        conn.close()

    # Plain list of string candidates; missing names are dropped because
    # they cannot be compared as text.
    candidates = df2['NAME'].dropna().astype(str).tolist()

    def _best_hit(name):
        """Return extractOne's result for *name*, or None if not a string."""
        if not isinstance(name, str):
            return None
        return process.extractOne(
            name, candidates, scorer=fuzz.ratio, score_cutoff=0
        )

    # One extractOne call per supplier provides both the match and its
    # score, instead of scanning every candidate twice.
    hits = [_best_hit(name) for name in df1['SupplierName']]
    df1["match"] = [None if hit is None else hit[0] for hit in hits]
    df1["SCORES"] = [0 if hit is None else hit[1] for hit in hits]

    df1 = df1.sort_values(by='SCORES', ascending=True)
    print(df1)
    df1.to_csv("./fuzzy_match2.csv", index=False)

    t2 = time.time()
    print(t2 - t1)
# Run the matching job only when this file is executed as a script.
if __name__ == '__main__':
    db_conn(db_host, db_user, db_passwd, db_database, db_port)