agate学习笔记

司徒正信

2023-12-01

一、数据类型

text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
data_type = agate.Date()

数据类型测试方法 ctype
for v in example_row:
value_type = ctype_text[v.ctype]

二、agate(rows,titles,types)

例：

#测试
example_row = sheet.row(6)

print('example_row: ',example_row)
#print('slice方法: ',sheet.row_slice(6))
print('example_row[0].ctype: ',example_row[0].ctype)
print('example_row[0].value: ',example_row[0].value)

print('ctype_text: ',ctype_text)

#合并标题
title_rows = zip(sheet.row_values(4),sheet.row_values(5))
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]   #消除字符串开头结尾的空格

#为agate库创建类型列表
types = []

for v in example_row:
    value_type = ctype_text[v.ctype]#指定ctype方法检查对象
    if value_type == 'text':
        types.append(text_type)
        print('check_text_type: ',text_type)
    elif value_type == 'number':
        types.append(number_type)
        print('check_number_type: ', number_type)
    elif value_type == 'date':
        types.append(date_type)
        print('check_date_type: ', date_type)
    else:
        types.append(text_type)
        print('check_text_type: ', text_type)
print ('types: ',types)
#该操作将遍历所有列，并返回列数据类型
#标题和类型打包

#将上述结果导入agate表中
country_rows = [sheet.row_values(r) for r in range (6,114)]
#table = agate.Table(country_rows,titles,types) 直接操作会报错，数据中的'-'会被当作空值处理
#清洗整数列中的'-'字符，确保整数列中拥有空数据，而不是'-'
def remove_bad_chars(val):
    if val == '-':
        return None
    return val


cleaned_rows = []
for row in country_rows:
    cleaned_row = [remove_bad_chars(rv) for rv in row]
    cleaned_rows.append(cleand_row)

table = agate.Table(cleaned_rows,titles,types)

三、其他函数

1、检查列名称

table.column_names

2、排序方法order_by
该方法为从小到大排序，reverse设置为True可从大到小排序

most_egregious = table.order_by(‘Total (%)’,reverse=True).limit(10)
#该方法创建新表

四、统计学函数

1.包含数字的列：

table.aggregate(agate.Sum(‘salary’)) #求和
table.aggregate(agate.Min(‘salary’)) #最小值
table.aggregate(agate.Max(‘salary’)) #最大值
table.aggregate(agate.Mean(‘salary’)) #均值
table.aggregate(agate.Median(‘salary’)) #中位数
table.aggregate(agate.Mode(‘salary’))
table.aggregate(agate.Variance(‘salary’)) #方差
table.aggregate(agate.StDev(‘salary’))
table.aggregate(agate.MAD(‘salary’))

多行操作方法：

table.aggregate([
agate.Min(‘salary’),
agate.Mean(‘salary’),
agate.Max(‘salary’)
])

2.汇总统计：

doctors = patients.group_by(‘doctor’)
patient_ages = doctors.aggregate([
(‘patient_count’, agate.Count()),
(‘age_mean’, agate.Mean(‘age’)),
(‘age_median’, agate.Median(‘age’))
])
连接方法：Table.group_by()
例：
doctors_by_state = patients.group_by(“state”).group_by(‘doctor’)

3.特殊方法：

列数据频率统计：

Table.pivot()  
#计算一列中每个唯一值的数量
table.pivot(['列标题1', '列标题2'])
#计算值的分布占总数的百分比
table.pivot('doctor', computation=agate.Percent('Count'))

识别异常值：找出离群值

pip install agate-atat

标准差离群值

import agatestats
agatestats.patch()
outliers = table.stdev_outliers('salary', deviations=3, reject=False)  
#deviations：标准差，标准差越大得到离群值越少
#reject=False告诉函数得到离群值，如设置为true将会得到没有离群值的数据

五、练习代码

import xlrd
import agate
from xlrd.sheet import ctype_text

workbook = xlrd.open_workbook('G:/数据处理练习文件/data-wrangling-master/data/unicef/unicef_oct_2014.xls')

print(workbook.nsheets)

#sheet = workbook.sheets()[0]
#print('workbook.sheet_names: ',workbook.sheet_names())
#print('workbook.sheet: ',workbook.sheet_names()[0])
#print('type:',type(sheet))
#print('1:',sheet.row_values(6))
#print('6行1列：',sheet.row_values(6,0,1))
sheet = workbook.sheets()[0]#选择工作表

#print('number',sheet.nrows)
#print('title:',sheet.row_values(0))
#for i in range(sheet.nrows):
    #print(i,sheet.row(i))

#数据类型
text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
data_type = agate.Date()

#测试
example_row = sheet.row(6)

print('example_row: ',example_row)
#print('slice方法: ',sheet.row_slice(6))
print('example_row[0].ctype: ',example_row[0].ctype)
print('example_row[0].value: ',example_row[0].value)

print('ctype_text: ',ctype_text)

#合并标题
title_rows = zip(sheet.row_values(4),sheet.row_values(5))
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]   #消除字符串开头结尾的空格

#为agate库创建类型列表
types = []

for v in example_row:
    value_type = ctype_text[v.ctype]#指定ctype方法检查对象
    if value_type == 'text':
        types.append(text_type)
        print('check_text_type: ',text_type)
    elif value_type == 'number':
        types.append(number_type)
        print('check_number_type: ', number_type)
    elif value_type == 'date':
        types.append(date_type)
        print('check_date_type: ', date_type)
    else:
        types.append(text_type)
        print('check_text_type: ', text_type)
print ('types: ',types)
#该操作将遍历所有列，并返回列数据类型
#标题和类型打包

#将上述结果导入agate表中
country_rows = [sheet.row_values(r) for r in range (6,114)]
#table = agate.Table(country_rows,titles,types) 直接操作会报错，数据中的'-'会被当作空值处理
#清洗整数列中的'-'字符，确保整数列中拥有空数据，而不是'-'
def remove_bad_chars(val):
    if val == '-':
        return None
    return val


cleaned_rows = []
for row in country_rows:
    cleaned_row = [remove_bad_chars(rv) for rv in row]
    cleaned_rows.append(cleaned_row)

table = agate.Table(cleaned_rows,titles,types)

print(table.column_names)
print('最糟糕的十个国家：')
most_egregious = table.order_by('Total (%)',reverse=True).limit(10)
limit_ten_country = []
for r in most_egregious.rows:
    limit_ten_country.append(r[0])
print(limit_ten_country)

agate学习笔记

一、数据类型

二、agate(rows,titles,types)

三、其他函数

四、统计学函数

五、练习代码

相关阅读

相关文章

相关问答

相关文档