text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
data_type = agate.Date()
数据类型测试方法 ctype
for v in example_row:
value_type = ctype_text[v.ctype]
例:
#测试
example_row = sheet.row(6)
print('example_row: ',example_row)
#print('slice方法: ',sheet.row_slice(6))
print('example_row[0].ctype: ',example_row[0].ctype)
print('example_row[0].value: ',example_row[0].value)
print('ctype_text: ',ctype_text)
#合并标题
title_rows = zip(sheet.row_values(4),sheet.row_values(5))
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles] #消除字符串开头结尾的空格
#为agate库创建类型列表
types = []
for v in example_row:
value_type = ctype_text[v.ctype]#指定ctype方法检查对象
if value_type == 'text':
types.append(text_type)
print('check_text_type: ',text_type)
elif value_type == 'number':
types.append(number_type)
print('check_number_type: ', number_type)
elif value_type == 'date':
types.append(date_type)
print('check_date_type: ', date_type)
else:
types.append(text_type)
print('check_text_type: ', text_type)
print ('types: ',types)
#该操作将遍历所有列,并返回列数据类型
#标题和类型打包
#将上述结果导入agate表中
country_rows = [sheet.row_values(r) for r in range (6,114)]
#table = agate.Table(country_rows,titles,types) 直接操作会报错,数据中的'-'会被当作空值处理
#清洗整数列中的'-'字符,确保整数列中拥有空数据,而不是'-'
def remove_bad_chars(val):
if val == '-':
return None
return val
cleaned_rows = []
for row in country_rows:
cleaned_row = [remove_bad_chars(rv) for rv in row]
cleaned_rows.append(cleand_row)
table = agate.Table(cleaned_rows,titles,types)
1、检查列名称
table.column_names
2、排序方法order_by
该方法为从小到大排序,reverse设置为True可从大到小排序
most_egregious = table.order_by(‘Total (%)’,reverse=True).limit(10)
#该方法创建新表
1.包含数字的列:
table.aggregate(agate.Sum(‘salary’)) #求和
table.aggregate(agate.Min(‘salary’)) #最小值
table.aggregate(agate.Max(‘salary’)) #最大值
table.aggregate(agate.Mean(‘salary’)) #均值
table.aggregate(agate.Median(‘salary’)) #中位数
table.aggregate(agate.Mode(‘salary’))
table.aggregate(agate.Variance(‘salary’)) #方差
table.aggregate(agate.StDev(‘salary’))
table.aggregate(agate.MAD(‘salary’))
多行操作方法:
table.aggregate([
agate.Min(‘salary’),
agate.Mean(‘salary’),
agate.Max(‘salary’)
])
2.汇总统计:
doctors = patients.group_by(‘doctor’)
patient_ages = doctors.aggregate([
(‘patient_count’, agate.Count()),
(‘age_mean’, agate.Mean(‘age’)),
(‘age_median’, agate.Median(‘age’))
])
连接方法:Table.group_by()
例:
doctors_by_state = patients.group_by(“state”).group_by(‘doctor’)
3.特殊方法:
列数据频率统计:
Table.pivot()
#计算一列中每个唯一值的数量
table.pivot(['列标题1', '列标题2'])
#计算值的分布占总数的百分比
table.pivot('doctor', computation=agate.Percent('Count'))
识别异常值:找出离群值
pip install agate-atat
标准差离群值
import agatestats
agatestats.patch()
outliers = table.stdev_outliers('salary', deviations=3, reject=False)
#deviations:标准差,标准差越大得到离群值越少
#reject=False告诉函数得到离群值,如设置为true将会得到没有离群值的数据
import xlrd
import agate
from xlrd.sheet import ctype_text
workbook = xlrd.open_workbook('G:/数据处理练习文件/data-wrangling-master/data/unicef/unicef_oct_2014.xls')
print(workbook.nsheets)
#sheet = workbook.sheets()[0]
#print('workbook.sheet_names: ',workbook.sheet_names())
#print('workbook.sheet: ',workbook.sheet_names()[0])
#print('type:',type(sheet))
#print('1:',sheet.row_values(6))
#print('6行1列:',sheet.row_values(6,0,1))
sheet = workbook.sheets()[0]#选择工作表
#print('number',sheet.nrows)
#print('title:',sheet.row_values(0))
#for i in range(sheet.nrows):
#print(i,sheet.row(i))
#数据类型
text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
data_type = agate.Date()
#测试
example_row = sheet.row(6)
print('example_row: ',example_row)
#print('slice方法: ',sheet.row_slice(6))
print('example_row[0].ctype: ',example_row[0].ctype)
print('example_row[0].value: ',example_row[0].value)
print('ctype_text: ',ctype_text)
#合并标题
title_rows = zip(sheet.row_values(4),sheet.row_values(5))
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles] #消除字符串开头结尾的空格
#为agate库创建类型列表
types = []
for v in example_row:
value_type = ctype_text[v.ctype]#指定ctype方法检查对象
if value_type == 'text':
types.append(text_type)
print('check_text_type: ',text_type)
elif value_type == 'number':
types.append(number_type)
print('check_number_type: ', number_type)
elif value_type == 'date':
types.append(date_type)
print('check_date_type: ', date_type)
else:
types.append(text_type)
print('check_text_type: ', text_type)
print ('types: ',types)
#该操作将遍历所有列,并返回列数据类型
#标题和类型打包
#将上述结果导入agate表中
country_rows = [sheet.row_values(r) for r in range (6,114)]
#table = agate.Table(country_rows,titles,types) 直接操作会报错,数据中的'-'会被当作空值处理
#清洗整数列中的'-'字符,确保整数列中拥有空数据,而不是'-'
def remove_bad_chars(val):
if val == '-':
return None
return val
cleaned_rows = []
for row in country_rows:
cleaned_row = [remove_bad_chars(rv) for rv in row]
cleaned_rows.append(cleaned_row)
table = agate.Table(cleaned_rows,titles,types)
print(table.column_names)
print('最糟糕的十个国家:')
most_egregious = table.order_by('Total (%)',reverse=True).limit(10)
limit_ten_country = []
for r in most_egregious.rows:
limit_ten_country.append(r[0])
print(limit_ten_country)