Pandas 秘籍 - 第七章

优质

小牛编辑

138浏览

2023-12-01

# 通常的开头
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# 使图表更大更漂亮
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'
# 在 Pandas 0.12 中需要展示大量的列 
# 在 Pandas 0.13 中不需要
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

杂乱数据的主要问题之一是：你怎么知道它是否杂乱呢？

我们将在这里使用 NYC 311 服务请求数据集，因为它很大，有点不方便。

requests = pd.read_csv('../data/311-service-requests.csv')

7.1 我怎么知道它是否杂乱？

我们在这里查看几列。我知道邮政编码有一些问题，所以让我们先看看它。

要了解列是否有问题，我通常使用.unique()来查看所有的值。如果它是一列数字，我将绘制一个直方图来获得分布的感觉。

当我们看看Incident Zip中的唯一值时，很快就会清楚这是一个混乱。

一些问题：

一些已经解析为字符串，一些是浮点
存在nan
部分邮政编码为29616-0759或83
有一些 Pandas 无法识别的 N/A 值，如'N/A'和'NO CLUE'

我们可以做的事情：

将N/A和NO CLUE规格化为nan值
看看 83 处发生了什么，并决定做什么
将一切转化为字符串

requests['Incident Zip'].unique()

array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,
       11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,
       11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,
       10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,
       11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,
       10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,
       10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,
       10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,
       11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,
       10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,
       10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,
       10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,
       11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,
       10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,
       11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0,
       11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0,
       11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0,
       11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0,
       10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0,
       11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0,
       10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0,
       10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0,
       10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0,
       10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0,
       11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0,
       10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0,
       11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0,
       10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310',
       '11105', '10462', '10029', '10301', '10457', '10467', '10469',
       '11225', '10035', '10031', '11226', '10454', '11221', '10025',
       '11229', '11235', '11422', '10472', '11208', '11102', '10032',
       '11216', '10473', '10463', '11213', '10040', '10302', '11231',
       '10470', '11204', '11104', '11212', '10466', '11416', '11214',
       '10009', '11692', '11385', '11423', '11201', '10024', '11435',
       '10312', '10030', '11106', '10033', '10303', '11215', '11222',
       '11354', '10016', '10034', '11420', '10304', '10019', '11237',
       '11249', '11230', '11372', '11207', '11378', '11419', '11361',
       '10011', '11357', '10012', '11358', '10003', '10002', '11374',
       '10007', '11234', '10065', '11369', '11434', '11205', '11206',
       '11415', '11236', '11218', '11413', '10458', '11101', '10306',
       '11355', '10023', '11368', '10314', '11421', '10010', '10018',
       '11223', '10455', '11377', '11433', '11375', '10037', '11209',
       '10459', '10128', '10014', '10282', '11373', '10451', '11238',
       '11211', '10038', '11694', '11203', '11691', '11232', '10305',
       '10021', '11228', '10036', '10001', '10017', '11217', '11219',
       '10308', '10465', '11379', '11414', '10460', '11417', '11220',
       '11366', '10027', '11370', '10309', '11412', '11356', '10456',
       '11432', '10022', '10013', '11367', '11040', '10026', '10475',
       '11210', '11364', '11426', '10471', '10119', '11224', '11418',
       '11429', '11365', '10461', '11239', '10039', '00083', '11411',
       '10075', '11004', '11360', '10453', '10028', '11430', '10307',
       '11103', '10004', '10069', '10005', '10474', '11428', '11436',
       '10020', '11001', '11362', '11693', '10464', '11427', '10044',
       '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109',
       '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE',
       '90010', '10281', '11747', '23541', '11776', '11697', '11788',
       '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0,
       7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803',
       '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518',
       '07020', '08807', '11577', '07114', '11003', '07201', '11563',
       '61702', '10103', '29616-0759', '35209-3114', '11520', '11735',
       '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0,
       13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)

7.3 修复`nan`值和字符串/浮点混淆

我们可以将na_values选项传递到pd.read_csv来清理它们。我们还可以指定Incident Zip的类型是字符串，而不是浮点。

na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})

requests['Incident Zip'].unique()

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225',
       '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541',
       '11788', '07604', '10112', '11563', '11580', '07087', '11042',
       '07093', '11501', '92123', '00000', '11575', '07109', '11797',
       '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518',
       '07020', '08807', '11577', '07114', '11003', '07201', '61702',
       '10103', '29616-0759', '35209-3114', '11520', '11735', '10129',
       '11005', '41042', '11590', '06901', '07208', '11530', '13221',
       '10954', '11111', '10107'], dtype=object)

7.4 短横线处发生了什么

rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])

requests[rows_with_dashes]

	Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
29136	26550551	10/24/2013 06:16:34 PM	NaN	DCA	Department of Consumer Affairs	Consumer Complaint	False Advertising	NaN	77092-2016	2700 EAST SELTICE WAY	EAST SELTICE WAY	NaN	NaN	NaN	NaN	NaN	HOUSTON	NaN	NaN	Assigned	11/13/2013 11:15:20 AM	10/29/2013 11:16:16 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
30939	26548831	10/24/2013 09:35:10 AM	NaN	DCA	Department of Consumer Affairs	Consumer Complaint	Harassment	NaN	55164-0737	P.O. BOX 64437	64437	NaN	NaN	NaN	NaN	NaN	ST. PAUL	NaN	NaN	Assigned	11/13/2013 02:30:21 PM	10/29/2013 02:31:06 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
70539	26488417	10/15/2013 03:40:33 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	Street	11549-3650	365 HOFSTRA UNIVERSITY	HOFSTRA UNIVERSITY	NaN	NaN	NaN	NaN	NaN	HEMSTEAD	NaN	NaN	Assigned	11/30/2013 01:20:33 PM	10/16/2013 01:21:39 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	La Guardia Airport	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
85821	26468296	10/10/2013 12:36:43 PM	10/26/2013 01:07:07 AM	DCA	Department of Consumer Affairs	Consumer Complaint	Debt Not Owed	NaN	29616-0759	PO BOX 25759	BOX 25759	NaN	NaN	NaN	NaN	NaN	GREENVILLE	NaN	NaN	Closed	10/26/2013 09:20:28 AM	10/26/2013 01:07:07 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
89304	26461137	10/09/2013 05:23:46 PM	10/25/2013 01:06:41 AM	DCA	Department of Consumer Affairs	Consumer Complaint	Harassment	NaN	35209-3114	600 BEACON PKWY	BEACON PKWY	NaN	NaN	NaN	NaN	NaN	BIRMINGHAM	NaN	NaN	Closed	10/25/2013 02:43:42 PM	10/25/2013 01:06:41 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

我认为这些都是缺失的数据，像这样删除它们：

requests['Incident Zip'][rows_with_dashes] = np.nan

但是我的朋友 Dave 指出，9 位邮政编码是正常的。让我们看看所有超过 5 位数的邮政编码，确保它们没问题，然后截断它们。

long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()

array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',
       '35209-3114'], dtype=object)

这些看起来可以截断：

requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)

就可以了。

早些时候我认为 00083 是一个损坏的邮政编码，但事实证明中央公园的邮政编码是 00083！显示我知道的吧。我仍然关心 00000 邮政编码，但是：让我们看看。

requests[requests['Incident Zip'] == '00000']

	Unique Key	Created Date	Closed Date	Agency	Agency Name	Complaint Type	Descriptor	Location Type	Incident Zip	Incident Address	Street Name	Cross Street 1	Cross Street 2	Intersection Street 1	Intersection Street 2	Address Type	City	Landmark	Facility Type	Status	Due Date	Resolution Action Updated Date	Community Board	Borough	X Coordinate (State Plane)	Y Coordinate (State Plane)	Park Facility Name	Park Borough	School Name	School Number	School Region	School Code	School Phone Number	School Address	School City	School State	School Zip	School Not Found	School or Citywide Complaint	Vehicle Type	Taxi Company Borough	Taxi Pick Up Location	Bridge Highway Name	Bridge Highway Direction	Road Ramp	Bridge Highway Segment	Garage Lot Name	Ferry Direction	Ferry Terminal Name	Latitude	Longitude	Location
42600	26529313	10/22/2013 02:51:06 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	NaN	00000	EWR EWR	EWR	NaN	NaN	NaN	NaN	NaN	NEWARK	NaN	NaN	Assigned	12/07/2013 09:53:51 AM	10/23/2013 09:54:43 AM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	Other	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
60843	26507389	10/17/2013 05:48:44 PM	NaN	TLC	Taxi and Limousine Commission	Taxi Complaint	Driver Complaint	Street	00000	1 NEWARK AIRPORT	NEWARK AIRPORT	NaN	NaN	NaN	NaN	NaN	NEWARK	NaN	NaN	Assigned	12/02/2013 11:59:46 AM	10/18/2013 12:01:08 PM	0 Unspecified	Unspecified	NaN	NaN	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	Unspecified	N	NaN	NaN	NaN	Other	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

这看起来对我来说很糟糕，让我将它们设为NaN。

zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan

太棒了，让我们看看现在在哪里。

unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips

array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109',
       '07114', '07201', '07208', '07306', '07604', '08807', '10000',
       '10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10009', '10010', '10011', '10012', '10013', '10014', '10016',
       '10017', '10018', '10019', '10020', '10021', '10022', '10023',
       '10024', '10025', '10026', '10027', '10028', '10029', '10030',
       '10031', '10032', '10033', '10034', '10035', '10036', '10037',
       '10038', '10039', '10040', '10044', '10048', '10065', '10069',
       '10075', '10103', '10107', '10112', '10119', '10128', '10129',
       '10153', '10162', '10280', '10281', '10282', '10301', '10302',
       '10303', '10304', '10305', '10306', '10307', '10308', '10309',
       '10310', '10312', '10314', '10451', '10452', '10453', '10454',
       '10455', '10456', '10457', '10458', '10459', '10460', '10461',
       '10462', '10463', '10464', '10465', '10466', '10467', '10468',
       '10469', '10470', '10471', '10472', '10473', '10474', '10475',
       '10573', '10803', '10954', '11001', '11003', '11004', '11005',
       '11040', '11042', '11101', '11102', '11103', '11104', '11105',
       '11106', '11109', '11111', '11201', '11203', '11204', '11205',
       '11206', '11207', '11208', '11209', '11210', '11211', '11212',
       '11213', '11214', '11215', '11216', '11217', '11218', '11219',
       '11220', '11221', '11222', '11223', '11224', '11225', '11226',
       '11228', '11229', '11230', '11231', '11232', '11233', '11234',
       '11235', '11236', '11237', '11238', '11239', '11249', '11354',
       '11355', '11356', '11357', '11358', '11360', '11361', '11362',
       '11363', '11364', '11365', '11366', '11367', '11368', '11369',
       '11370', '11372', '11373', '11374', '11375', '11377', '11378',
       '11379', '11385', '11411', '11412', '11413', '11414', '11415',
       '11416', '11417', '11418', '11419', '11420', '11421', '11422',
       '11423', '11426', '11427', '11428', '11429', '11430', '11432',
       '11433', '11434', '11435', '11436', '11501', '11518', '11520',
       '11530', '11549', '11559', '11563', '11575', '11577', '11580',
       '11590', '11691', '11692', '11693', '11694', '11697', '11716',
       '11722', '11735', '11747', '11776', '11788', '11797', '13221',
       '14225', '19711', '23502', '23541', '29616', '35209', '41042',
       '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)

太棒了！这更加干净。虽然这里有一些奇怪的东西 - 我在谷歌地图上查找 77056，这是在德克萨斯州。

让我们仔细看看：

zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
is_far = ~(is_close) & zips.notnull()

zips[is_far]

12102    77056
13450    70711
29136    77092
30939    55164
44008    90010
47048    23541
57636    92123
71001    92123
71834    23502
80573    61702
85821    29616
89304    35209
94201    41042
Name: Incident Zip, dtype: object

requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip')

	Incident Zip	Descriptor	City
71834	23502	Harassment	NORFOLK
47048	23541	Harassment	NORFOLK
85821	29616	Debt Not Owed	GREENVILLE
89304	35209	Harassment	BIRMINGHAM
94201	41042	Harassment	FLORENCE
30939	55164	Harassment	ST. PAUL
80573	61702	Billing Dispute	BLOOMIGTON
13450	70711	Contract Dispute	CLIFTON
12102	77056	Debt Not Owed	HOUSTON
29136	77092	False Advertising	HOUSTON
44008	90010	Billing Dispute	LOS ANGELES
57636	92123	Harassment	SAN DIEGO
71001	92123	Billing Dispute	SAN DIEGO

好吧，真的有来自 LA 和休斯敦的请求！很高兴知道它们。按邮政编码过滤可能是处理它的一个糟糕的方式 - 我们真的应该看着城市。

requests['City'].str.upper().value_counts()

BROOKLYN            31662
NEW YORK            22664
BRONX               18438
STATEN ISLAND        4766
JAMAICA              2246
FLUSHING             1803
ASTORIA              1568
RIDGEWOOD            1073
CORONA                707
OZONE PARK            693
LONG ISLAND CITY      678
FAR ROCKAWAY          652
ELMHURST              647
WOODSIDE              609
EAST ELMHURST         562
...
MELVILLE                  1
PORT JEFFERSON STATION    1
NORWELL                   1
EAST ROCKAWAY             1
BIRMINGHAM                1
ROSLYN                    1
LOS ANGELES               1
MINEOLA                   1
JERSEY CITY               1
ST. PAUL                  1
CLIFTON                   1
COL.ANVURES               1
EDGEWATER                 1
ROSELYN                   1
CENTRAL ISLIP             1
Length: 100, dtype: int64

看起来这些是合法的投诉，所以我们只是把它们放在一边。

7.5 把它们放到一起

这里是我们最后所做的事情，用于清理我们的邮政编码，都在一起：

na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', 
                       na_values=na_values, 
                       dtype={'Incident Zip': str})

def fix_zip_codes(zips):
    # Truncate everything to length 5 
    zips = zips.str.slice(0, 5)
    # Set 00000 zip codes to nan
    zero_zips = zips == '00000'
    zips[zero_zips] = np.nan
    return zips

requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])

requests['Incident Zip'].unique()

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209', '10021', '10037', '11413', '11375', '11238', '10473',
       '11103', '11354', '11361', '11106', '11385', '10463', '10467',
       '11204', '11237', '11377', '11364', '11434', '11435', '11210',
       '11228', '11368', '11694', '10464', '11415', '10314', '10301',
       '10018', '10038', '11105', '11230', '10468', '11104', '10471',
       '11416', '10075', '11422', '11355', '10028', '10462', '10306',
       '10461', '11224', '11429', '10035', '11366', '11362', '11206',
       '10460', '10304', '11360', '11411', '10455', '10475', '10069',
       '10303', '10308', '10302', '11357', '10470', '11367', '11370',
       '10454', '10451', '11436', '11426', '10153', '11004', '11428',
       '11427', '11001', '11363', '10004', '10474', '11430', '10000',
       '10307', '11239', '10119', '10006', '10048', '11697', '11692',
       '11693', '10573', '00083', '11559', '10020', '77056', '11776',
       '70711', '10282', '11109', '10044', '02061', '77092', '14225',
       '55164', '19711', '07306', '90010', '11747', '23541', '11788',
       '07604', '10112', '11563', '11580', '07087', '11042', '07093',
       '11501', '92123', '11575', '07109', '11797', '10803', '11716',
       '11722', '11549', '10162', '23502', '11518', '07020', '08807',
       '11577', '07114', '11003', '07201', '61702', '10103', '29616',
       '35209', '11520', '11735', '10129', '11005', '41042', '11590',
       '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)

Pandas 秘籍 - 第七章

7.1 我怎么知道它是否杂乱？

7.3 修复nan值和字符串/浮点混淆

7.4 短横线处发生了什么

7.5 把它们放到一起

7.3 修复`nan`值和字符串/浮点混淆