当前位置: 首页 > 工具软件 > Fuzzywuzzy > 使用案例 >

Python+FuzzyWuzzy实现模糊匹配并通过pandarallel进行多线程加速

苍元章
2023-12-01

Python+FuzzyWuzzy实现模糊匹配并通过pandarallel进行多线程加速

FuzzyWuzzy介绍

FuzzyWuzzy 是一个简单易用的模糊字符串匹配工具包。它依据 Levenshtein Distance 算法,计算两个序列之间的差异。
Levenshtein Distance 算法,又叫 Edit Distance(编辑距离)算法,是指两个字符串之间,由一个转成另一个所需的最少编辑操作次数。许可的编辑操作包括将一个字符替换成另一个字符、插入一个字符、删除一个字符。一般来说,编辑距离越小,两个串的相似度越大。
from sqlalchemy import create_engine,Table,Column,Date,Integer,String,ForeignKey
import os
import pymssql
import pymysql
import datetime
import time
import sys
import numpy
from text2vec import Similarity
from gensim import similarities
import paramiko
from smb.SMBConnection import *
import csv
import re
import sqlalchemy
import pandas as pd
from sqlalchemy import MetaData,create_engine,Table,Integer
import socket
import fuzzywuzzy
import pandarallel
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
# Start pandarallel's worker pool so that pandas objects gain the
# ``parallel_apply`` family of methods. The file imports the *module*
# ``pandarallel``; the ``initialize`` entry point lives on the
# ``pandarallel`` class inside it (the bare name ``parallel`` used
# previously is not defined anywhere and raised NameError).
pandarallel.pandarallel.initialize()
# Wall-clock start time; db_conn() prints the elapsed time against it.
t1 = time.time()

# SQL Server connection settings.
# NOTE(review): these values appear to have been masked/truncated by the
# author before publishing -- replace them with real settings (ideally
# loaded from the environment rather than hard-coded) before running.
db_host = '10.1*****'
db_user = '**admin'
db_passwd = '****'
db_port = '20001'
db_database = 'ODS_'

def fuzzy_match(x, choices, scorer, cutoff):
    """Return the entry of ``choices`` that best matches ``x``.

    Thin wrapper around ``process.extractOne``.

    Args:
        x: Query value to look up.
        choices: Candidates to compare ``x`` against.
        scorer: Scoring function handed to ``extractOne`` (the caller in
            this file passes ``fuzz.ratio``).
        cutoff: Forwarded as ``score_cutoff``.

    Returns:
        The best-matching choice, or ``None`` when ``extractOne`` reports
        no match.
    """
    best = process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    # extractOne yields None when nothing reaches the cutoff (or there are
    # no choices); indexing that result directly would raise TypeError.
    if best is None:
        return None
    return best[0]

def fuzzy_score(x, choices, scorer, cutoff):
    """Return the similarity score of the best match for ``x`` in ``choices``.

    Thin wrapper around ``process.extractOne``.

    Args:
        x: Query value to look up.
        choices: Candidates to compare ``x`` against.
        scorer: Scoring function handed to ``extractOne`` (the caller in
            this file passes ``fuzz.ratio``).
        cutoff: Forwarded as ``score_cutoff``.

    Returns:
        The score of the best-matching choice, or ``None`` when
        ``extractOne`` reports no match.
    """
    best = process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    # extractOne yields None when nothing reaches the cutoff (or there are
    # no choices); indexing that result directly would raise TypeError.
    if best is None:
        return None
    return best[1]

def db_conn(db_host, db_user, db_passwd, db_database, db_port):
    """Load both tables, fuzzy-match supplier names and write the result to CSV.

    Reads ``ODS_Legal_Ariba_VendorData_New`` and
    ``ODS_Legal_WorldCheck_Name_Full`` from SQL Server, matches every
    ``SupplierName`` against the ``NAME`` column with ``fuzz.ratio``, stores
    the best match and its score in the ``match`` / ``SCORES`` columns,
    sorts ascending by score, prints the frame, writes it to
    ``./fuzzy_match2.csv`` and prints the seconds elapsed since the
    module-level ``t1``.

    Args:
        db_host: Server address passed to ``pymssql.connect``.
        db_user: Login name.
        db_passwd: Login password.
        db_database: Database name.
        db_port: Server port.

    Raises:
        Whatever ``pymssql.connect`` / ``pd.read_sql`` raise on failure.
    """
    # pymssql.connect raises on failure, so no separate "is the cursor
    # valid" check is needed; the previously created cursor was never used.
    conn = pymssql.connect(
        server=db_host,
        user=db_user,
        password=db_passwd,
        database=db_database,
        port=db_port,
    )
    try:
        print('succeed')
        df1 = pd.read_sql("select * from ODS_Legal_Ariba_VendorData_New", con=conn)
        df2 = pd.read_sql("select * from ODS_Legal_WorldCheck_Name_Full", con=conn)
    finally:
        # Both frames are fully loaded; release the connection before the
        # long-running matching step instead of leaking it.
        conn.close()

    supplier_names = df1['SupplierName']
    # Use pandarallel's parallel_apply when pandarallel has been initialised
    # (it adds that method to Series); otherwise fall back to plain apply.
    apply_fn = getattr(supplier_names, 'parallel_apply', supplier_names.apply)

    # NOTE(review): match and score are computed in two separate passes over
    # the same data; a single extractOne pass per row would halve the work.
    df1["match"] = apply_fn(fuzzy_match, args=(df2['NAME'], fuzz.ratio, 0))
    df1["SCORES"] = apply_fn(fuzzy_score, args=(df2['NAME'], fuzz.ratio, 0))

    df1 = df1.sort_values(by='SCORES', ascending=True)
    print(df1)
    df1.to_csv("./fuzzy_match2.csv", index=False)

    t2 = time.time()
    print(t2 - t1)

# Run the matching job only when this file is executed as a script.
if __name__ == '__main__':
    db_conn(db_host, db_user, db_passwd, db_database, db_port)

 类似资料: