# * coding:utf-8_*_
# 作者 :XiangLin
# 创建时间 :12/02/2020 11:28
# 文件 :2-6_Cleaning_Data.py
# IDE :PyCharm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
plt.style.use("bmh")
plt.rc('font', family='SimHei', size=25) #显示中文
pd.set_option('display.max_columns',1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',1000)
什么样的数据叫做脏数据/有问题的数据?
我们用NYC 311服务请求数据来一起看看,这个数据量不算小,同时也有一些东西确实可以处理一下。
requests = pd.read_csv('311-service-requests.csv')
print(requests.head())
输出:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
0 26589651 10/31/2013 02:08:41 AM NaN NYPD New York City Police Department Noise - Street/Sidewalk Loud Talking Street/Sidewalk 11432 90-03 169 STREET 169 STREET 90 AVENUE 91 AVENUE NaN NaN ADDRESS JAMAICA NaN Precinct Assigned 10/31/2013 10:08:41 AM 10/31/2013 02:35:17 AM 12 QUEENS QUEENS 1042027.0 197389.0 Unspecified QUEENS Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 40.708275 -73.791604 (40.70827532593202, -73.79160395779721)
1 26593698 10/31/2013 02:01:04 AM NaN NYPD New York City Police Department Illegal Parking Commercial Overnight Parking Street/Sidewalk 11378 58 AVENUE 58 AVENUE 58 PLACE 59 STREET NaN NaN BLOCKFACE MASPETH NaN Precinct Open 10/31/2013 10:01:04 AM NaN 05 QUEENS QUEENS 1009349.0 201984.0 Unspecified QUEENS Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 40.721041 -73.909453 (40.721040535628305, -73.90945306791765)
2 26594139 10/31/2013 02:00:24 AM 10/31/2013 02:40:32 AM NYPD New York City Police Department Noise - Commercial Loud Music/Party Club/Bar/Restaurant 10032 4060 BROADWAY BROADWAY WEST 171 STREET WEST 172 STREET NaN NaN ADDRESS NEW YORK NaN Precinct Closed 10/31/2013 10:00:24 AM 10/31/2013 02:39:42 AM 12 MANHATTAN MANHATTAN 1001088.0 246531.0 Unspecified MANHATTAN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 40.843330 -73.939144 (40.84332975466513, -73.93914371913482)
3 26595721 10/31/2013 01:56:23 AM 10/31/2013 02:21:48 AM NYPD New York City Police Department Noise - Vehicle Car/Truck Horn Street/Sidewalk 10023 WEST 72 STREET WEST 72 STREET COLUMBUS AVENUE AMSTERDAM AVENUE NaN NaN BLOCKFACE NEW YORK NaN Precinct Closed 10/31/2013 09:56:23 AM 10/31/2013 02:21:10 AM 07 MANHATTAN MANHATTAN 989730.0 222727.0 Unspecified MANHATTAN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 40.778009 -73.980213 (40.7780087446372, -73.98021349023975)
4 26590930 10/31/2013 01:53:44 AM NaN DOHMH Department of Health and Mental Hygiene Rodent Condition Attracting Rodents Vacant Lot 10027 WEST 124 STREET WEST 124 STREET LENOX AVENUE ADAM CLAYTON POWELL JR BOULEVARD NaN NaN BLOCKFACE NEW YORK NaN NaN Pending 11/30/2013 01:53:44 AM 10/31/2013 01:59:54 AM 10 MANHATTAN MANHATTAN 998815.0 233545.0 Unspecified MANHATTAN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 40.807691 -73.947387 (40.80769092704951, -73.94738703491433)
其实也没有特别好的办法,还是得先拿点数据出来看看。比如说我们这里观察到邮政编码可能有问题的字段。
需要提到的一点是 .unique() 函数有很巧的用处,我们把所有出现过的邮政编码列出来(之后再看看分布?),也许会有一些想法。
下面我们就把unique()用起来,然后你会发现,确确实实是存在一些问题的,比如:
为什么大部分被解析出数值,而有些被解析出字符串了?
好多缺省值(nan)
格式不一样,有些是29616-0759,有些是83
有一些pandas不认的,比如’N/A’或者’NO CLUE’
那我们能做什么呢?
规整’N/A’和’NO CLUE’到缺省值的“队列”里
看看83是什么鬼,然后再决定怎么处理
统一一下,全处理成字符串好啦
print(requests['Incident Zip'].unique())
[11432.0 11378.0 10032.0 10023.0 10027.0 11372.0 11419.0 11417.0 10011.0
11225.0 11218.0 10003.0 10029.0 10466.0 11219.0 10025.0 10310.0 11236.0
nan 10033.0 11216.0 10016.0 10305.0 10312.0 10026.0 10309.0 10036.0
11433.0 11235.0 11213.0 11379.0 11101.0 10014.0 11231.0 11234.0 10457.0
10459.0 10465.0 11207.0 10002.0 10034.0 11233.0 10453.0 10456.0 10469.0
11374.0 11221.0 11421.0 11215.0 10007.0 10019.0 11205.0 11418.0 11369.0
11249.0 10005.0 10009.0 11211.0 11412.0 10458.0 11229.0 10065.0 10030.0
11222.0 10024.0 10013.0 11420.0 11365.0 10012.0 11214.0 11212.0 10022.0
11232.0 11040.0 11226.0 10281.0 11102.0 11208.0 10001.0 10472.0 11414.0
11223.0 10040.0 11220.0 11373.0 11203.0 11691.0 11356.0 10017.0 10452.0
10280.0 11217.0 10031.0 11201.0 11358.0 10128.0 11423.0 10039.0 10010.0
11209.0 10021.0 10037.0 11413.0 11375.0 11238.0 10473.0 11103.0 11354.0
11361.0 11106.0 11385.0 10463.0 10467.0 11204.0 11237.0 11377.0 11364.0
11434.0 11435.0 11210.0 11228.0 11368.0 11694.0 10464.0 11415.0 10314.0
10301.0 10018.0 10038.0 11105.0 11230.0 10468.0 11104.0 10471.0 11416.0
10075.0 11422.0 11355.0 10028.0 10462.0 10306.0 10461.0 11224.0 11429.0
10035.0 11366.0 11362.0 11206.0 10460.0 10304.0 11360.0 11411.0 10455.0
10475.0 10069.0 10303.0 10308.0 10302.0 11357.0 10470.0 11367.0 11370.0
10454.0 10451.0 11436.0 11426.0 10153.0 11004.0 11428.0 11427.0 11001.0
11363.0 10004.0 10474.0 11430.0 10000.0 10307.0 11239.0 10119.0 10006.0
10048.0 11697.0 11692.0 11693.0 10573.0 83.0 11559.0 10020.0 77056.0
11776.0 70711.0 10282.0 11109.0 10044.0 '10452' '11233' '10468' '10310'
'11105' '10462' '10029' '10301' '10457' '10467' '10469' '11225' '10035'
'10031' '11226' '10454' '11221' '10025' '11229' '11235' '11422' '10472'
'11208' '11102' '10032' '11216' '10473' '10463' '11213' '10040' '10302'
'11231' '10470' '11204' '11104' '11212' '10466' '11416' '11214' '10009'
'11692' '11385' '11423' '11201' '10024' '11435' '10312' '10030' '11106'
'10033' '10303' '11215' '11222' '11354' '10016' '10034' '11420' '10304'
'10019' '11237' '11249' '11230' '11372' '11207' '11378' '11419' '11361'
'10011' '11357' '10012' '11358' '10003' '10002' '11374' '10007' '11234'
'10065' '11369' '11434' '11205' '11206' '11415' '11236' '11218' '11413'
'10458' '11101' '10306' '11355' '10023' '11368' '10314' '11421' '10010'
'10018' '11223' '10455' '11377' '11433' '11375' '10037' '11209' '10459'
'10128' '10014' '10282' '11373' '10451' '11238' '11211' '10038' '11694'
'11203' '11691' '11232' '10305' '10021' '11228' '10036' '10001' '10017'
'11217' '11219' '10308' '10465' '11379' '11414' '10460' '11417' '11220'
'11366' '10027' '11370' '10309' '11412' '11356' '10456' '11432' '10022'
'10013' '11367' '11040' '10026' '10475' '11210' '11364' '11426' '10471'
'10119' '11224' '11418' '11429' '11365' '10461' '11239' '10039' '00083'
'11411' '10075' '11004' '11360' '10453' '10028' '11430' '10307' '11103'
'10004' '10069' '10005' '10474' '11428' '11436' '10020' '11001' '11362'
'11693' '10464' '11427' '10044' '11363' '10006' '10000' '02061'
'77092-2016' '10280' '11109' '14225' '55164-0737' '19711' '07306'
'000000' 'NO CLUE' '90010' '10281' '11747' '23541' '11776' '11697'
'11788' '07604' 10112.0 11788.0 11563.0 11580.0 7087.0 11042.0 7093.0
11501.0 92123.0 0.0 11575.0 7109.0 11797.0 '10803' '11716' '11722'
'11549-3650' '10162' '92123' '23502' '11518' '07020' '08807' '11577'
'07114' '11003' '07201' '11563' '61702' '10103' '29616-0759' '35209-3114'
'11520' '11735' '10129' '11005' '41042' '11590' 6901.0 7208.0 11530.0
13221.0 10954.0 11735.0 10103.0 7114.0 11111.0 10107.0]
我们可以在pd.read_csv读数据的时候,传一个na_values给它,清理掉一部分的脏数据,我们还可以指明说,我们就要保证邮政编码是字符串型的,不要给我整些数值型出来!!
na_values = ['NO CLUE','N/A','0']
requests = pd.read_csv('311-service-requests.csv',na_values=na_values,dtype={'Incident Zip':str})
print(requests['Incident Zip'].unique())
输出:
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
'11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
'11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
'10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
'11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
'11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
'11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
'11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
'11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
'10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
'11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
'11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
'11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
'10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
'10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
'10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
'10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
'10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
'11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
'10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
'11776' '70711' '10282' '11109' '10044' '02061' '77092-2016' '14225'
'55164-0737' '19711' '07306' '000000' '90010' '11747' '23541' '11788'
'07604' '10112' '11563' '11580' '07087' '11042' '07093' '11501' '92123'
'00000' '11575' '07109' '11797' '10803' '11716' '11722' '11549-3650'
'10162' '23502' '11518' '07020' '08807' '11577' '07114' '11003' '07201'
'61702' '10103' '29616-0759' '35209-3114' '11520' '11735' '10129' '11005'
'41042' '11590' '06901' '07208' '11530' '13221' '10954' '11111' '10107']
xx = requests.loc[requests['Incident Zip'].str.contains('-').fillna(False),'Incident Zip']
print(xx)
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
print(len(requests[rows_with_dashes]))
# 真心是很烦人啊,其实只有5个,输出来看看是什么
print(requests[rows_with_dashes])
输出:
29136 77092-2016
30939 55164-0737
70539 11549-3650
85821 29616-0759
89304 35209-3114
Name: Incident Zip, dtype: object
5
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
29136 26550551 10/24/2013 06:16:34 PM NaN DCA Department of Consumer Affairs Consumer Complaint False Advertising NaN 77092-2016 2700 EAST SELTICE WAY EAST SELTICE WAY NaN NaN NaN NaN NaN HOUSTON NaN NaN Assigned 11/13/2013 11:15:20 AM 10/29/2013 11:16:16 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
30939 26548831 10/24/2013 09:35:10 AM NaN DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 55164-0737 P.O. BOX 64437 64437 NaN NaN NaN NaN NaN ST. PAUL NaN NaN Assigned 11/13/2013 02:30:21 PM 10/29/2013 02:31:06 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70539 26488417 10/15/2013 03:40:33 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 11549-3650 365 HOFSTRA UNIVERSITY HOFSTRA UNIVERSITY NaN NaN NaN NaN NaN HEMSTEAD NaN NaN Assigned 11/30/2013 01:20:33 PM 10/16/2013 01:21:39 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN La Guardia Airport NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
85821 26468296 10/10/2013 12:36:43 PM 10/26/2013 01:07:07 AM DCA Department of Consumer Affairs Consumer Complaint Debt Not Owed NaN 29616-0759 PO BOX 25759 BOX 25759 NaN NaN NaN NaN NaN GREENVILLE NaN NaN Closed 10/26/2013 09:20:28 AM 10/26/2013 01:07:07 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
89304 26461137 10/09/2013 05:23:46 PM 10/25/2013 01:06:41 AM DCA Department of Consumer Affairs Consumer Complaint Harassment NaN 35209-3114 600 BEACON PKWY BEACON PKWY NaN NaN NaN NaN NaN BIRMINGHAM NaN NaN Closed 10/25/2013 02:43:42 PM 10/25/2013 01:06:41 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
本来就5个,打算直接把这些都设置成缺省值(nan)的:requests[‘Incident Zip’][rows_with_dashes] = np.nan 后来查了查,发现可能前5位置是真实的邮编,所以干脆截取一下好了。
long_zip_codes = requests['Incident Zip'].str.len() > 5
print(requests['Incident Zip'][long_zip_codes].unique())
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0,5)
输出:
['77092-2016' '55164-0737' '000000' '11549-3650' '29616-0759' '35209-3114']
搞定啦!
妈蛋查了下00000,发现根本不是什么美国加拿大的邮编,所以这个是不能这么处理的,还真得重新设为缺省值。
print(requests[requests['Incident Zip'] == '00000'])
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips,'Incident Zip'] = np.nan
# 完工!!再来看看现在的数据什么样了。
unique_zips = requests['Incident Zip'].unique()
#unique_zips.sort_values()
print(unique_zips)
输出:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type City Landmark Facility Type Status Due Date Resolution Action Updated Date Community Board Borough X Coordinate (State Plane) Y Coordinate (State Plane) Park Facility Name Park Borough School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
42600 26529313 10/22/2013 02:51:06 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint NaN 00000 EWR EWR EWR NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/07/2013 09:53:51 AM 10/23/2013 09:54:43 AM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
60843 26507389 10/17/2013 05:48:44 PM NaN TLC Taxi and Limousine Commission Taxi Complaint Driver Complaint Street 00000 1 NEWARK AIRPORT NEWARK AIRPORT NaN NaN NaN NaN NaN NEWARK NaN NaN Assigned 12/02/2013 11:59:46 AM 10/18/2013 12:01:08 PM 0 Unspecified Unspecified NaN NaN Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N NaN NaN NaN Other NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
'11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
'11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
'10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
'11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
'11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
'11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
'11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
'11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
'10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
'11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
'11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
'11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
'10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
'10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
'10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
'10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
'10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
'11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
'10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
'11776' '70711' '10282' '11109' '10044' '02061' '77092' '14225' '55164'
'19711' '07306' '90010' '11747' '23541' '11788' '07604' '10112' '11563'
'11580' '07087' '11042' '07093' '11501' '92123' '11575' '07109' '11797'
'10803' '11716' '11722' '11549' '10162' '23502' '11518' '07020' '08807'
'11577' '07114' '11003' '07201' '61702' '10103' '29616' '35209' '11520'
'11735' '10129' '11005' '41042' '11590' '06901' '07208' '11530' '13221'
'10954' '11111' '10107']
看起来干净多了。
但是真的做完了吗?
zips = requests['Incident Zip']
# 用is_close表示0或者1开始的比较正确的邮编
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# 非缺省值但不以0或者1开始的邮编认为是有些困惑的
is_far = ~(is_close) & zips.notnull()
print(zips[is_far])
# 可以排个序,然后对应输出一些东西
requests_zip = requests[is_far][['Incident Zip','Descriptor','City']].sort_values('Incident Zip')
print(requests_zip)
输出:
12102 77056
13450 70711
29136 77092
30939 55164
44008 90010
47048 23541
57636 92123
71001 92123
71834 23502
80573 61702
85821 29616
89304 35209
94201 41042
Name: Incident Zip, dtype: object
Incident Zip Descriptor City
71834 23502 Harassment NORFOLK
47048 23541 Harassment NORFOLK
85821 29616 Debt Not Owed GREENVILLE
89304 35209 Harassment BIRMINGHAM
94201 41042 Harassment FLORENCE
30939 55164 Harassment ST. PAUL
80573 61702 Billing Dispute BLOOMIGTON
13450 70711 Contract Dispute CLIFTON
12102 77056 Debt Not Owed HOUSTON
29136 77092 False Advertising HOUSTON
44008 90010 Billing Dispute LOS ANGELES
57636 92123 Harassment SAN DIEGO
71001 92123 Billing Dispute SAN DIEGO
咳咳,突然觉得,恩,刚才做的一大堆工作,其实只是告诉你,我们可以这样去处理和补齐数据。
但你实际上会发现,好像其实用city直接对应一下就可以补上一些东西啊。
city_counts = requests['City'].str.upper().value_counts()
print(city_counts)
输出:
BROOKLYN 31662
NEW YORK 22664
BRONX 18438
STATEN ISLAND 4766
JAMAICA 2246
FLUSHING 1803
ASTORIA 1568
RIDGEWOOD 1073
CORONA 707
OZONE PARK 693
LONG ISLAND CITY 678
FAR ROCKAWAY 652
ELMHURST 647
WOODSIDE 609
EAST ELMHURST 562
QUEENS VILLAGE 549
FOREST HILLS 541
JACKSON HEIGHTS 541
SOUTH RICHMOND HILL 521
MASPETH 473
WOODHAVEN 464
FRESH MEADOWS 435
SPRINGFIELD GARDENS 434
BAYSIDE 411
SOUTH OZONE PARK 410
RICHMOND HILL 404
REGO PARK 402
MIDDLE VILLAGE 396
SAINT ALBANS 387
WHITESTONE 348
...
WEST NEW YORK 1
UNION CITY 1
LOS ANGELES 1
WOODBURY 1
BRIDGE WATER 1
NEW YOR 1
FREEPORT 1
COL.ANVURES 1
BRIARWOOD 1
JERSEY CITY 1
ROSELYN 1
NANUET 1
HASBROCK HEIGHTS 1
SYRACUSE 1
NEWARK AIRPORT 1
BLOOMIGTON 1
CENTRAL ISLIP 1
EDGEWATER 1
ROSLYN 1
NORWELL 1
STAMFORD 1
FLORENCE 1
EAST ROCKAWAY 1
BIRMINGHAM 1
RYEBROOK 1
HEMSTEAD 1
LYNBROOK 1
BOHIEMA 1
LAWRENCE 1
ELIZABETH 1
Name: City, Length: 100, dtype: int64
所以汇总一下,我们在邮编这个字段,是这样做数据清洗的:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('311-service-requests.csv',na_values = na_values,dtype={'Incident Zip': str})
def fix_zip_codes(zips):
# Truncate everything to length 5
zips = zips.str.slice(0,5)
# Set 00000 zip codes to nan
zero_zips = zips == '00000'
zips[zero_zips] = np.nan
return zips
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
result = requests['Incident Zip'].unique()
print(result)
输出:
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
'11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
'11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
'10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
'11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
'11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
'11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
'11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
'11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
'10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
'11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
'11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
'11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
'10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
'10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
'10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
'10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
'10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
'11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
'10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
'11776' '70711' '10282' '11109' '10044' '02061' '77092' '14225' '55164'
'19711' '07306' '90010' '11747' '23541' '11788' '07604' '10112' '11563'
'11580' '07087' '11042' '07093' '11501' '92123' '11575' '07109' '11797'
'10803' '11716' '11722' '11549' '10162' '23502' '11518' '07020' '08807'
'11577' '07114' '11003' '07201' '61702' '10103' '29616' '35209' '11520'
'11735' '10129' '11005' '41042' '11590' '06901' '07208' '11530' '13221'
'10954' '11111' '10107']
pandas.read_csv(filepath_or_buffer, sep=', ', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)
sep=',' # 以,为数据分隔符
shkiprows= 10 # 跳过前十行
nrows = 10 # 只去前10行
parse_dates = ['col_name'] # 指定某行读取为日期格式
index_col = ['col_1','col_2'] # 读取指定的几列
error_bad_lines = False # 当某行数据有问题时,不报错,直接跳过,处理脏数据时使用
na_values = 'NULL' # 将NULL识别为空值
数据链接:链接:https://pan.baidu.com/s/1BX9z2WeAHrA0hduuf1OD0A
提取码:o2o8
来自七月在线数据挖掘算法
向林
2020年2月13日于重庆城口
好好学习,天天向上,终有所获