pandas 基础API

夹谷英杰
2023-12-01
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

'''
Series
创建pd.Series(data=None, index=None, dtype=None)
'''
# s1 = pd.Series(np.arange(1, 10, 1))
# print(s1.values)  # 数据
# print(s1.index)  # 索引
# print(s1.index.values)  # 索引数据
# color_count = pd.Series({'red': 100, 'blue': 200, 'green': 500, 'yellow': 1000})
# print(color_count['red'])  # 使用索引来获取数据

'''
DataFrame
创建pd.DataFrame(data=None, index=None, columns=None)
index：行标签 axis=0
columns：列标签 axis=1
'''

# Random integer scores: 10 students (rows) x 5 subjects (columns),
# drawn with low=50 inclusive and high=100 exclusive.
score = np.random.randint(low=50, high=100, size=(10, 5))
score_df = pd.DataFrame(data=score)
# print(score_df)
# print(score_df.shape)  # (10, 5): m rows, n columns
# stu = ['同学' + str(i) for i in range(score_df.shape[0])]
# score_df.index = stu  # set the row labels
# score_df.columns = ["语文", "数学", "英语", "政治", "体育"]
# print(score_df)

# Same scores again, this time labelled at construction time:
# one column per subject, one row label per student ("stu0", "stu1", ...).
subjects = ["语文", "数学", "英语", "政治", "体育"]
stu = [f'stu{row}' for row in range(len(score_df))]
data = pd.DataFrame(data=score, index=stu, columns=subjects)
# print(data)
# print(data.columns.get_indexer(["体育", "政治"]))
# print(data['语文'])
# print(data['语文']['stu0'])
# print(data.columns)
# print(data.index)
# print(data.values)
# print(data.T) # 转置
# data.head(5) # 显示前5行内容
# data.tail(5) # 显示后5行内容
# print(data.sort_values(by="数学", ascending=False))
# print(data.sort_values(by=["数学", "语文"], ascending=False))
# print(data['数学'] > 80)
# print(data[data['数学'] > 80])  # 得到数学大于80的
# print(data[(data['数学'] > 65) & (data['语文'] < 85)])
# print(data.query('数学>65 & 语文<85'))
# print(data['数学'].isin([60, 70, 80]))
# print(data[data['数学'].isin([60, 70, 80])])

'''
常用统计函数：axis=0 代表按列求统计结果，axis=1 代表按行求统计结果
max()、min()
std()：标准差
var()：方差
median()：中位数
idxmax()：求出最大值的位置
idxmin()：求出最小值的位置
cumsum(): 累计和
cummax(): 当前最大值
cummin(): 当前最小值
cumprod(): 累计积
apply() : 自定义函数
'''
# print(data.describe())
# print(data.max(axis=0))  # axis=0 列 即每个学科最高分
# print(data.max(axis=1))  # axis=1 行 即每个同学最高分
# print(data[['数学', '英语']].apply(lambda x: x.max() - x.min(), axis=0))  # 分别得到数学和英语最大值最小值的分差
# math = data['数学']
# print(math.cummax())

'''
Pandas画图
pd.DataFrame.plot(kind='line')
line : 折线图
bar : 条形图
barh : 横放的条形图
hist : 直方图
pie : 饼图
scatter: 散点图
kind : str，需要绘制图形的种类
'''
# math.plot()
# plt.show()

'''
文件读取与存储
pd.read_csv(filepath_or_buffer, sep=',', usecols=None)
    filepath_or_buffer:文件路径
    sep :分隔符，默认用","隔开
    usecols:指定读取的列名，列表形式
    
DataFrame.to_csv(path_or_buf=None, sep=',', columns=None, header=True, index=True, mode='w', encoding=None)
    path_or_buf :文件路径
    sep :分隔符，默认用","隔开
    columns :选择需要的列索引
    header :boolean or list of string, default True,是否写进列索引值
    index:是否写进行索引
    mode:‘w’：重写, ‘a’ 追加
    
pd.read_json(path_or_buf=None, orient=None, typ='frame', lines=False)
    path_or_buf : 路径
    orient : string,以什么样的格式显示.下面是5种格式：
        1.split 将索引总结到索引，列名到列名，数据到数据。将三部分都分开了
        2.records 以columns：values的形式输出
        3.index 以index：{columns：values}…的形式输出
        4.columns 以columns:{index:values}的形式输出
        5.values 直接输出值
    lines : boolean, default False
    typ : default ‘frame’， 指定转换成的对象类型series或者dataframe
DataFrame.to_json(path_or_buf=None, orient=None, lines=False) 将Pandas 对象存储为json格式
    path_or_buf=None：文件地址
    orient:存储的json形式，{'split', 'records', 'index', 'columns', 'values'}
    lines:一个对象存储为一行
'''
# data.to_csv("data_v1.csv", header=True, index=True)
# print(pd.read_csv("data_v1.csv"))
# json_df = pd.read_json("1.json", orient="records", lines=True)
# print(json_df)
# print(pd.isnull(data))  # 判断是否是缺失值，是则返回True
# print(np.all(pd.isnull(data)))  # np.all()：只要有一个False（非缺失值）就返回False
#
# print(pd.notnull(data))  # 判断是否不是缺失值，是缺失值则返回False
# print(np.all(pd.notnull(data)))  # np.all()：只要有一个False（缺失值）就返回False

'''
数据离散化
'''
# 自行分组
# qcut = pd.qcut(data['数学'], 10)
# 计算分到每个组数据个数
# print(qcut)
# print(qcut['stu0'])  # 学生1在哪个区间
# print(qcut.value_counts())  # 统计每个分组中有多少数据

# 自定义区间分组
# bins = [50, 60, 70, 80, 90, 100]
# p_counts = pd.cut(data['数学'], bins)
# print(p_counts.value_counts())

# Grouping and aggregation
col = pd.DataFrame(
    data={
        'color': ['white', 'red', 'green', 'red', 'green'],
        'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
        'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
        'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
    },
)
print(col)

# Mean of price1 per color: group the frame by color, then select price1.
print(col.groupby(by=['color'])['price1'].mean())

# Same aggregation, starting from the price1 column and grouping it by the color column.
print(col['price1'].groupby(by=col['color']).mean())

# as_index=False keeps color as a regular column, so the result stays a flat table.
print(col.groupby(by=['color'], as_index=False)['price1'].mean())

# Bar chart of the number of 'object' entries in each color group.
(
    col.groupby(by=['color'])['object']
    .count()
    .plot(kind='bar')
)
plt.show()

# Small sales table used by the commented-out set_index examples that follow.
df = pd.DataFrame(dict(
    month=[1, 4, 7, 10],
    year=[2012, 2014, 2013, 2014],
    sale=[55, 40, 84, 31],
))
# print(df)
# df = df.set_index('month')
# df = df.set_index(['month', 'year'])
# print(df)
# print(df.index)
# print(df.index.names)
# print(df.index.levels)


# csv_df = pd.read_csv("data.csv")
# print(csv_df)
# print(csv_df.loc[1:2, 'score'])
pandas 基础API

相关阅读

相关文章

相关问答

相关文档