2-6_Cleaning_Data清洗数据

方昊
2023-12-01


清洗数据
清洗和处理数据通常也是非常重要一个环节,这节提提这个内容。

# * coding:utf-8_*_
# 作者     :XiangLin
# 创建时间 :12/02/2020 11:28
# 文件     :2-6_Cleaning_Data.py
# IDE      :PyCharm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
plt.style.use("bmh")
plt.rc('font', family='SimHei', size=25) #显示中文
pd.set_option('display.max_columns',1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',1000)

什么样的数据叫做脏数据/有问题的数据?

我们用NYC 311服务请求数据来一起看看,这个数据量不算小,同时也有一些东西确实可以处理一下。

requests = pd.read_csv('311-service-requests.csv')
print(requests.head())
输出:
  Unique Key            Created Date             Closed Date Agency                              Agency Name           Complaint Type                    Descriptor        Location Type Incident Zip  Incident Address      Street Name   Cross Street 1                    Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type      City Landmark Facility Type    Status                Due Date Resolution Action Updated Date Community Board    Borough  X Coordinate (State Plane)  Y Coordinate (State Plane) Park Facility Name Park Borough  School Name School Number School Region  School Code School Phone Number School Address  School City School State   School Zip School Not Found  School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name   Latitude  Longitude                                  Location
0    26589651  10/31/2013 02:08:41 AM                     NaN   NYPD          New York City Police Department  Noise - Street/Sidewalk                  Loud Talking      Street/Sidewalk        11432  90-03 169 STREET       169 STREET        90 AVENUE                         91 AVENUE                   NaN                   NaN      ADDRESS   JAMAICA      NaN      Precinct  Assigned  10/31/2013 10:08:41 AM         10/31/2013 02:35:17 AM       12 QUEENS     QUEENS                   1042027.0                    197389.0        Unspecified       QUEENS  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN  40.708275 -73.791604   (40.70827532593202, -73.79160395779721)
1    26593698  10/31/2013 02:01:04 AM                     NaN   NYPD          New York City Police Department          Illegal Parking  Commercial Overnight Parking      Street/Sidewalk        11378         58 AVENUE        58 AVENUE         58 PLACE                         59 STREET                   NaN                   NaN    BLOCKFACE   MASPETH      NaN      Precinct      Open  10/31/2013 10:01:04 AM                            NaN       05 QUEENS     QUEENS                   1009349.0                    201984.0        Unspecified       QUEENS  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN  40.721041 -73.909453  (40.721040535628305, -73.90945306791765)
2    26594139  10/31/2013 02:00:24 AM  10/31/2013 02:40:32 AM   NYPD          New York City Police Department       Noise - Commercial              Loud Music/Party  Club/Bar/Restaurant        10032     4060 BROADWAY         BROADWAY  WEST 171 STREET                   WEST 172 STREET                   NaN                   NaN      ADDRESS  NEW YORK      NaN      Precinct    Closed  10/31/2013 10:00:24 AM         10/31/2013 02:39:42 AM    12 MANHATTAN  MANHATTAN                   1001088.0                    246531.0        Unspecified    MANHATTAN  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN  40.843330 -73.939144   (40.84332975466513, -73.93914371913482)
3    26595721  10/31/2013 01:56:23 AM  10/31/2013 02:21:48 AM   NYPD          New York City Police Department          Noise - Vehicle                Car/Truck Horn      Street/Sidewalk        10023    WEST 72 STREET   WEST 72 STREET  COLUMBUS AVENUE                  AMSTERDAM AVENUE                   NaN                   NaN    BLOCKFACE  NEW YORK      NaN      Precinct    Closed  10/31/2013 09:56:23 AM         10/31/2013 02:21:10 AM    07 MANHATTAN  MANHATTAN                    989730.0                    222727.0        Unspecified    MANHATTAN  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN  40.778009 -73.980213    (40.7780087446372, -73.98021349023975)
4    26590930  10/31/2013 01:53:44 AM                     NaN  DOHMH  Department of Health and Mental Hygiene                   Rodent  Condition Attracting Rodents           Vacant Lot        10027   WEST 124 STREET  WEST 124 STREET     LENOX AVENUE  ADAM CLAYTON POWELL JR BOULEVARD                   NaN                   NaN    BLOCKFACE  NEW YORK      NaN           NaN   Pending  11/30/2013 01:53:44 AM         10/31/2013 01:59:54 AM    10 MANHATTAN  MANHATTAN                    998815.0                    233545.0        Unspecified    MANHATTAN  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN  40.807691 -73.947387   (40.80769092704951, -73.94738703491433)

6.1 怎么找到脏数据?

其实也没有特别好的办法,还是得先拿点数据出来看看。比如说我们这里观察到邮政编码可能有问题的字段。

需要提到的一点是 .unique() 函数有很巧的用处,我们把所有出现过的邮政编码列出来(之后再看看分布?),也许会有一些想法。

下面我们就把unique()用起来,然后你会发现,确确实实是存在一些问题的,比如:

为什么大部分被解析出数值,而有些被解析出字符串了?
好多缺省值(nan)
格式不一样,有些是29616-0759,有些是83
有一些pandas不认的,比如’N/A’或者’NO CLUE’
那我们能做什么呢?

规整’N/A’和’NO CLUE’到缺省值的“队列”里
看看83是什么鬼,然后再决定怎么处理
统一一下,全处理成字符串好啦

print(requests['Incident Zip'].unique())
[11432.0 11378.0 10032.0 10023.0 10027.0 11372.0 11419.0 11417.0 10011.0
 11225.0 11218.0 10003.0 10029.0 10466.0 11219.0 10025.0 10310.0 11236.0
 nan 10033.0 11216.0 10016.0 10305.0 10312.0 10026.0 10309.0 10036.0
 11433.0 11235.0 11213.0 11379.0 11101.0 10014.0 11231.0 11234.0 10457.0
 10459.0 10465.0 11207.0 10002.0 10034.0 11233.0 10453.0 10456.0 10469.0
 11374.0 11221.0 11421.0 11215.0 10007.0 10019.0 11205.0 11418.0 11369.0
 11249.0 10005.0 10009.0 11211.0 11412.0 10458.0 11229.0 10065.0 10030.0
 11222.0 10024.0 10013.0 11420.0 11365.0 10012.0 11214.0 11212.0 10022.0
 11232.0 11040.0 11226.0 10281.0 11102.0 11208.0 10001.0 10472.0 11414.0
 11223.0 10040.0 11220.0 11373.0 11203.0 11691.0 11356.0 10017.0 10452.0
 10280.0 11217.0 10031.0 11201.0 11358.0 10128.0 11423.0 10039.0 10010.0
 11209.0 10021.0 10037.0 11413.0 11375.0 11238.0 10473.0 11103.0 11354.0
 11361.0 11106.0 11385.0 10463.0 10467.0 11204.0 11237.0 11377.0 11364.0
 11434.0 11435.0 11210.0 11228.0 11368.0 11694.0 10464.0 11415.0 10314.0
 10301.0 10018.0 10038.0 11105.0 11230.0 10468.0 11104.0 10471.0 11416.0
 10075.0 11422.0 11355.0 10028.0 10462.0 10306.0 10461.0 11224.0 11429.0
 10035.0 11366.0 11362.0 11206.0 10460.0 10304.0 11360.0 11411.0 10455.0
 10475.0 10069.0 10303.0 10308.0 10302.0 11357.0 10470.0 11367.0 11370.0
 10454.0 10451.0 11436.0 11426.0 10153.0 11004.0 11428.0 11427.0 11001.0
 11363.0 10004.0 10474.0 11430.0 10000.0 10307.0 11239.0 10119.0 10006.0
 10048.0 11697.0 11692.0 11693.0 10573.0 83.0 11559.0 10020.0 77056.0
 11776.0 70711.0 10282.0 11109.0 10044.0 '10452' '11233' '10468' '10310'
 '11105' '10462' '10029' '10301' '10457' '10467' '10469' '11225' '10035'
 '10031' '11226' '10454' '11221' '10025' '11229' '11235' '11422' '10472'
 '11208' '11102' '10032' '11216' '10473' '10463' '11213' '10040' '10302'
 '11231' '10470' '11204' '11104' '11212' '10466' '11416' '11214' '10009'
 '11692' '11385' '11423' '11201' '10024' '11435' '10312' '10030' '11106'
 '10033' '10303' '11215' '11222' '11354' '10016' '10034' '11420' '10304'
 '10019' '11237' '11249' '11230' '11372' '11207' '11378' '11419' '11361'
 '10011' '11357' '10012' '11358' '10003' '10002' '11374' '10007' '11234'
 '10065' '11369' '11434' '11205' '11206' '11415' '11236' '11218' '11413'
 '10458' '11101' '10306' '11355' '10023' '11368' '10314' '11421' '10010'
 '10018' '11223' '10455' '11377' '11433' '11375' '10037' '11209' '10459'
 '10128' '10014' '10282' '11373' '10451' '11238' '11211' '10038' '11694'
 '11203' '11691' '11232' '10305' '10021' '11228' '10036' '10001' '10017'
 '11217' '11219' '10308' '10465' '11379' '11414' '10460' '11417' '11220'
 '11366' '10027' '11370' '10309' '11412' '11356' '10456' '11432' '10022'
 '10013' '11367' '11040' '10026' '10475' '11210' '11364' '11426' '10471'
 '10119' '11224' '11418' '11429' '11365' '10461' '11239' '10039' '00083'
 '11411' '10075' '11004' '11360' '10453' '10028' '11430' '10307' '11103'
 '10004' '10069' '10005' '10474' '11428' '11436' '10020' '11001' '11362'
 '11693' '10464' '11427' '10044' '11363' '10006' '10000' '02061'
 '77092-2016' '10280' '11109' '14225' '55164-0737' '19711' '07306'
 '000000' 'NO CLUE' '90010' '10281' '11747' '23541' '11776' '11697'
 '11788' '07604' 10112.0 11788.0 11563.0 11580.0 7087.0 11042.0 7093.0
 11501.0 92123.0 0.0 11575.0 7109.0 11797.0 '10803' '11716' '11722'
 '11549-3650' '10162' '92123' '23502' '11518' '07020' '08807' '11577'
 '07114' '11003' '07201' '11563' '61702' '10103' '29616-0759' '35209-3114'
 '11520' '11735' '10129' '11005' '41042' '11590' 6901.0 7208.0 11530.0
 13221.0 10954.0 11735.0 10103.0 7114.0 11111.0 10107.0]

6.3 处理缺省值和字符串/浮点混乱

我们可以在pd.read_csv读数据的时候,传一个na_values给它,清理掉一部分的脏数据,我们还可以指明说,我们就要保证邮政编码是字符串型的,不要给我整些数值型出来!!

na_values = ['NO CLUE','N/A','0']
requests = pd.read_csv('311-service-requests.csv',na_values=na_values,dtype={'Incident Zip':str})
print(requests['Incident Zip'].unique())
输出:
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
 '11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
 nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
 '11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
 '10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
 '11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
 '11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
 '11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
 '11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
 '11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
 '10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
 '11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
 '11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
 '11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
 '10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
 '10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
 '10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
 '10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
 '10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
 '11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
 '10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
 '11776' '70711' '10282' '11109' '10044' '02061' '77092-2016' '14225'
 '55164-0737' '19711' '07306' '000000' '90010' '11747' '23541' '11788'
 '07604' '10112' '11563' '11580' '07087' '11042' '07093' '11501' '92123'
 '00000' '11575' '07109' '11797' '10803' '11716' '11722' '11549-3650'
 '10162' '23502' '11518' '07020' '08807' '11577' '07114' '11003' '07201'
 '61702' '10103' '29616-0759' '35209-3114' '11520' '11735' '10129' '11005'
 '41042' '11590' '06901' '07208' '11530' '13221' '10954' '11111' '10107']

6.4 那些用“-”连接的邮编是什么鬼?

xx = requests.loc[requests['Incident Zip'].str.contains('-').fillna(False),'Incident Zip']
print(xx)
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
print(len(requests[rows_with_dashes]))
# 真心是很烦人啊,其实只有5个,输出来看看是什么
print(requests[rows_with_dashes])
输出:
29136    77092-2016
30939    55164-0737
70539    11549-3650
85821    29616-0759
89304    35209-3114
Name: Incident Zip, dtype: object
5
       Unique Key            Created Date             Closed Date Agency                     Agency Name      Complaint Type         Descriptor Location Type Incident Zip        Incident Address         Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type        City Landmark Facility Type    Status                Due Date Resolution Action Updated Date Community Board      Borough  X Coordinate (State Plane)  Y Coordinate (State Plane) Park Facility Name Park Borough  School Name School Number School Region  School Code School Phone Number School Address  School City School State   School Zip School Not Found  School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name  Latitude  Longitude Location
29136    26550551  10/24/2013 06:16:34 PM                     NaN    DCA  Department of Consumer Affairs  Consumer Complaint  False Advertising           NaN   77092-2016   2700 EAST SELTICE WAY    EAST SELTICE WAY            NaN            NaN                   NaN                   NaN          NaN     HOUSTON      NaN           NaN  Assigned  11/13/2013 11:15:20 AM         10/29/2013 11:16:16 AM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
30939    26548831  10/24/2013 09:35:10 AM                     NaN    DCA  Department of Consumer Affairs  Consumer Complaint         Harassment           NaN   55164-0737          P.O. BOX 64437               64437            NaN            NaN                   NaN                   NaN          NaN    ST. PAUL      NaN           NaN  Assigned  11/13/2013 02:30:21 PM         10/29/2013 02:31:06 PM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
70539    26488417  10/15/2013 03:40:33 PM                     NaN    TLC   Taxi and Limousine Commission      Taxi Complaint   Driver Complaint        Street   11549-3650  365 HOFSTRA UNIVERSITY  HOFSTRA UNIVERSITY            NaN            NaN                   NaN                   NaN          NaN    HEMSTEAD      NaN           NaN  Assigned  11/30/2013 01:20:33 PM         10/16/2013 01:21:39 PM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN    La Guardia Airport                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
85821    26468296  10/10/2013 12:36:43 PM  10/26/2013 01:07:07 AM    DCA  Department of Consumer Affairs  Consumer Complaint      Debt Not Owed           NaN   29616-0759            PO BOX 25759           BOX 25759            NaN            NaN                   NaN                   NaN          NaN  GREENVILLE      NaN           NaN    Closed  10/26/2013 09:20:28 AM         10/26/2013 01:07:07 AM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
89304    26461137  10/09/2013 05:23:46 PM  10/25/2013 01:06:41 AM    DCA  Department of Consumer Affairs  Consumer Complaint         Harassment           NaN   35209-3114         600 BEACON PKWY         BEACON PKWY            NaN            NaN                   NaN                   NaN          NaN  BIRMINGHAM      NaN           NaN    Closed  10/25/2013 02:43:42 PM         10/25/2013 01:06:41 AM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                   NaN                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN

本来就5个,打算直接把这些都设置成缺省值(nan)的:requests[‘Incident Zip’][rows_with_dashes] = np.nan 后来查了查,发现可能前5位置是真实的邮编,所以干脆截取一下好了。

long_zip_codes = requests['Incident Zip'].str.len() > 5
print(requests['Incident Zip'][long_zip_codes].unique())
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0,5)

输出:
['77092-2016' '55164-0737' '000000' '11549-3650' '29616-0759' '35209-3114']

搞定啦!

妈蛋查了下00000,发现根本不是什么美国加拿大的邮编,所以这个是不能这么处理的,还真得重新设为缺省值。

print(requests[requests['Incident Zip'] == '00000'])
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips,'Incident Zip'] = np.nan
# 完工!!再来看看现在的数据什么样了。
unique_zips = requests['Incident Zip'].unique()
#unique_zips.sort_values()
print(unique_zips)
输出:
      Unique Key            Created Date Closed Date Agency                    Agency Name  Complaint Type        Descriptor Location Type Incident Zip  Incident Address     Street Name Cross Street 1 Cross Street 2 Intersection Street 1 Intersection Street 2 Address Type    City Landmark Facility Type    Status                Due Date Resolution Action Updated Date Community Board      Borough  X Coordinate (State Plane)  Y Coordinate (State Plane) Park Facility Name Park Borough  School Name School Number School Region  School Code School Phone Number School Address  School City School State   School Zip School Not Found  School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name  Latitude  Longitude Location
42600    26529313  10/22/2013 02:51:06 PM         NaN    TLC  Taxi and Limousine Commission  Taxi Complaint  Driver Complaint           NaN        00000           EWR EWR             EWR            NaN            NaN                   NaN                   NaN          NaN  NEWARK      NaN           NaN  Assigned  12/07/2013 09:53:51 AM         10/23/2013 09:54:43 AM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                 Other                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
60843    26507389  10/17/2013 05:48:44 PM         NaN    TLC  Taxi and Limousine Commission  Taxi Complaint  Driver Complaint        Street        00000  1 NEWARK AIRPORT  NEWARK AIRPORT            NaN            NaN                   NaN                   NaN          NaN  NEWARK      NaN           NaN  Assigned  12/02/2013 11:59:46 AM         10/18/2013 12:01:08 PM   0 Unspecified  Unspecified                         NaN                         NaN        Unspecified  Unspecified  Unspecified   Unspecified   Unspecified  Unspecified         Unspecified    Unspecified  Unspecified  Unspecified  Unspecified                N                           NaN          NaN                  NaN                 Other                 NaN                      NaN       NaN                    NaN             NaN             NaN                 NaN       NaN        NaN      NaN
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
 '11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
 nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
 '11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
 '10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
 '11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
 '11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
 '11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
 '11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
 '11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
 '10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
 '11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
 '11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
 '11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
 '10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
 '10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
 '10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
 '10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
 '10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
 '11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
 '10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
 '11776' '70711' '10282' '11109' '10044' '02061' '77092' '14225' '55164'
 '19711' '07306' '90010' '11747' '23541' '11788' '07604' '10112' '11563'
 '11580' '07087' '11042' '07093' '11501' '92123' '11575' '07109' '11797'
 '10803' '11716' '11722' '11549' '10162' '23502' '11518' '07020' '08807'
 '11577' '07114' '11003' '07201' '61702' '10103' '29616' '35209' '11520'
 '11735' '10129' '11005' '41042' '11590' '06901' '07208' '11530' '13221'
 '10954' '11111' '10107']

看起来干净多了。
但是真的做完了吗?

zips = requests['Incident Zip']
# 用is_close表示0或者1开始的比较正确的邮编
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# 非缺省值但不以0或者1开始的邮编认为是有些困惑的
is_far = ~(is_close) & zips.notnull()
print(zips[is_far])
# 可以排个序,然后对应输出一些东西
requests_zip = requests[is_far][['Incident Zip','Descriptor','City']].sort_values('Incident Zip')
print(requests_zip)
输出:
12102    77056
13450    70711
29136    77092
30939    55164
44008    90010
47048    23541
57636    92123
71001    92123
71834    23502
80573    61702
85821    29616
89304    35209
94201    41042
Name: Incident Zip, dtype: object
      Incident Zip         Descriptor         City
71834        23502         Harassment      NORFOLK
47048        23541         Harassment      NORFOLK
85821        29616      Debt Not Owed   GREENVILLE
89304        35209         Harassment   BIRMINGHAM
94201        41042         Harassment     FLORENCE
30939        55164         Harassment     ST. PAUL
80573        61702    Billing Dispute   BLOOMIGTON
13450        70711   Contract Dispute      CLIFTON
12102        77056      Debt Not Owed      HOUSTON
29136        77092  False Advertising      HOUSTON
44008        90010    Billing Dispute  LOS ANGELES
57636        92123         Harassment    SAN DIEGO
71001        92123    Billing Dispute    SAN DIEGO

咳咳,突然觉得,恩,刚才做的一大堆工作,其实只是告诉你,我们可以这样去处理和补齐数据。
但你实际上会发现,好像其实用city直接对应一下就可以补上一些东西啊。

city_counts = requests['City'].str.upper().value_counts()
print(city_counts)
输出:
BROOKLYN               31662
NEW YORK               22664
BRONX                  18438
STATEN ISLAND           4766
JAMAICA                 2246
FLUSHING                1803
ASTORIA                 1568
RIDGEWOOD               1073
CORONA                   707
OZONE PARK               693
LONG ISLAND CITY         678
FAR ROCKAWAY             652
ELMHURST                 647
WOODSIDE                 609
EAST ELMHURST            562
QUEENS VILLAGE           549
FOREST HILLS             541
JACKSON HEIGHTS          541
SOUTH RICHMOND HILL      521
MASPETH                  473
WOODHAVEN                464
FRESH MEADOWS            435
SPRINGFIELD GARDENS      434
BAYSIDE                  411
SOUTH OZONE PARK         410
RICHMOND HILL            404
REGO PARK                402
MIDDLE VILLAGE           396
SAINT ALBANS             387
WHITESTONE               348
                       ...  
WEST NEW YORK              1
UNION CITY                 1
LOS ANGELES                1
WOODBURY                   1
BRIDGE  WATER              1
NEW YOR                    1
FREEPORT                   1
COL.ANVURES                1
BRIARWOOD                  1
JERSEY CITY                1
ROSELYN                    1
NANUET                     1
HASBROCK HEIGHTS           1
SYRACUSE                   1
NEWARK AIRPORT             1
BLOOMIGTON                 1
CENTRAL ISLIP              1
EDGEWATER                  1
ROSLYN                     1
NORWELL                    1
STAMFORD                   1
FLORENCE                   1
EAST ROCKAWAY              1
BIRMINGHAM                 1
RYEBROOK                   1
HEMSTEAD                   1
LYNBROOK                   1
BOHIEMA                    1
LAWRENCE                   1
ELIZABETH                  1
Name: City, Length: 100, dtype: int64

6.5 汇个总

所以汇总一下,我们在邮编这个字段,是这样做数据清洗的:

na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('311-service-requests.csv',na_values = na_values,dtype={'Incident Zip': str})
def fix_zip_codes(zips):
    # Truncate everything to length 5
    zips = zips.str.slice(0,5)
    # Set 00000 zip codes to nan
    zero_zips = zips == '00000'
    zips[zero_zips] = np.nan

    return zips

requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
result = requests['Incident Zip'].unique()
print(result)
输出:
['11432' '11378' '10032' '10023' '10027' '11372' '11419' '11417' '10011'
 '11225' '11218' '10003' '10029' '10466' '11219' '10025' '10310' '11236'
 nan '10033' '11216' '10016' '10305' '10312' '10026' '10309' '10036'
 '11433' '11235' '11213' '11379' '11101' '10014' '11231' '11234' '10457'
 '10459' '10465' '11207' '10002' '10034' '11233' '10453' '10456' '10469'
 '11374' '11221' '11421' '11215' '10007' '10019' '11205' '11418' '11369'
 '11249' '10005' '10009' '11211' '11412' '10458' '11229' '10065' '10030'
 '11222' '10024' '10013' '11420' '11365' '10012' '11214' '11212' '10022'
 '11232' '11040' '11226' '10281' '11102' '11208' '10001' '10472' '11414'
 '11223' '10040' '11220' '11373' '11203' '11691' '11356' '10017' '10452'
 '10280' '11217' '10031' '11201' '11358' '10128' '11423' '10039' '10010'
 '11209' '10021' '10037' '11413' '11375' '11238' '10473' '11103' '11354'
 '11361' '11106' '11385' '10463' '10467' '11204' '11237' '11377' '11364'
 '11434' '11435' '11210' '11228' '11368' '11694' '10464' '11415' '10314'
 '10301' '10018' '10038' '11105' '11230' '10468' '11104' '10471' '11416'
 '10075' '11422' '11355' '10028' '10462' '10306' '10461' '11224' '11429'
 '10035' '11366' '11362' '11206' '10460' '10304' '11360' '11411' '10455'
 '10475' '10069' '10303' '10308' '10302' '11357' '10470' '11367' '11370'
 '10454' '10451' '11436' '11426' '10153' '11004' '11428' '11427' '11001'
 '11363' '10004' '10474' '11430' '10000' '10307' '11239' '10119' '10006'
 '10048' '11697' '11692' '11693' '10573' '00083' '11559' '10020' '77056'
 '11776' '70711' '10282' '11109' '10044' '02061' '77092' '14225' '55164'
 '19711' '07306' '90010' '11747' '23541' '11788' '07604' '10112' '11563'
 '11580' '07087' '11042' '07093' '11501' '92123' '11575' '07109' '11797'
 '10803' '11716' '11722' '11549' '10162' '23502' '11518' '07020' '08807'
 '11577' '07114' '11003' '07201' '61702' '10103' '29616' '35209' '11520'
 '11735' '10129' '11005' '41042' '11590' '06901' '07208' '11530' '13221'
 '10954' '11111' '10107']

6.6 知识点补充

pandas.read_csv(filepath_or_buffer, sep=', ', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)

sep=','   # 以,为数据分隔符
shkiprows= 10   # 跳过前十行
nrows = 10   # 只去前10行
parse_dates = ['col_name']   # 指定某行读取为日期格式
index_col = ['col_1','col_2']   # 读取指定的几列
error_bad_lines = False   # 当某行数据有问题时,不报错,直接跳过,处理脏数据时使用
na_values = 'NULL'   # 将NULL识别为空值

数据链接:链接:https://pan.baidu.com/s/1BX9z2WeAHrA0hduuf1OD0A
提取码:o2o8
来自七月在线数据挖掘算法
向林
2020年2月13日于重庆城口
好好学习,天天向上,终有所获

 类似资料: