大家好,又见面了,我是全栈君。
# -*- coding: utf-8 -*-"""Created on Tue Feb 21 15:17:51 2017@author: zzpp220"""'''合并(merge)-列或者连接(concat)-行数据集'''from pandas import DataFrame,Seriesfrom numpy import nan as NaNimport pandas as pdimport numpy as npimport jsondf1=DataFrame({
'left':list('bbacaab'),'data1':range(7)})df2=DataFrame({
'right':list('abd'),'data2':range(3)})df12=pd.merge(df1,df2,left_on='left',right_on='right') ##分为内、外连接、像mysql 一样##重塑和轴向旋转 stack: change col to row 和unstack:change row to col'''这个好!!该有的都由了'''df3=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['sd','gz'],name='state'),columns=pd.Index(['one','two','thr'],name='number'))'''多重索引的Series和DataFrame可以通过stack,unstack相互转哈,默认情况下取的都是最内层的索引'''##将列转为行,得到一个Series,索引是一个多重索引stack_df3=df3.stack()#对于一个层次化索引的Series ,可以用unstack 重排为DataFrame。stack_df3.unstack()'''df3.stack()Out[19]:state numbersd one 0two 1thr 2gz one 3two 4thr 5dtype: int64stack_df3.unstack()Out[22]:number one two thrstatesd 0 1 2gz 3 4 5unstack and stack operate on the in-most level-index,also can operate on other level when give other para'''stack_df3.unstack('state')==stack_df3.unstack(0)## operate on the out-most level the same as :''''stack_df3.unstack() 默认是按最内层也就是’number‘Out[22]:number one two thrstatesd 0 1 2gz 3 4 5stack_df3.unstack('state')==stack_df3.unstack(0)Out[25]:state sd gznumberone True Truetwo True Truethr True True''''''对全部的营养数据做分析'''db=json.load(open('/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch07/foods-2011-10-03.json'))nutrients=DataFrame(db[0]['nutrients'])'''nutrients[:10]Out[36]:description group units value0 Protein Composition g 25.181 Total lipid (fat) Composition g 29.202 Carbohydrate, by difference Composition g 3.063 Ash Other g 3.284 Energy Energy kcal 376.005 Water Composition g 39.286 Energy Energy kJ 1573.007 Fiber, total dietary Composition g 0.008 Calcium, Ca Elements mg 673.009 Iron, Fe Elements mg 0.64'''info_keys=['description','group','id','manufacturer']info=DataFrame(db,columns=info_keys)##仅仅导入db中的info_keys中的字段,其他的就不导入了;info[:5]'''查看食物类别的分布情况'''#info.group.value_counts()[:10]#查看group中的分布情况==pd.values_count(info.group)'''将所有的食物的营养成分整合到一个大表中'''nutrients=[]for rec in db:fnuts=DataFrame(rec['nutrients'])##将各食物的营养成分整合到一个大表fnuts['id']=rec['id']#并且在表中添加一个表示编号的列,用原来的表中的值赋值nutrients.append(fnuts)#依次将DataFrame添加到大的list中nutrients=pd.concat(nutrients,ignore_index=True)##用concat连接起来大表'''查看表中有多少重复项'''nutrients.duplicated().sum()'''丢弃重复项'''nutrients.drop_duplicates()'''重命名对象中的列名(因为二者有相同的列,可能会混淆)'''col_mapping={
'description':'food','group':'fgroup'}info=info.rename(columns=col_mapping,copy=False)'''info.rename(columns=col_mapping,copy=False)[:3]Out[50]:food fgroup id manufacturer0 Cheese, caraway Dairy and Egg Products 10081 Cheese, cheddar Dairy and Egg Products 10092 Cheese, edam Dairy and Egg Products 1018'''col_mapping={
'description':'nutrients','group':'nutgroup'}nutrients=nutrients.rename(columns=col_mapping,copy=False)'''合并info和nutrients'''ndata=pd.merge(nutrients,info,on='id',how='outer')'''根据营养分类,得出锌的中位值'''result=ndata.groupby(['nutrients','fgroup'])['value'].quantile(0.5)
转载于:https://www.cnblogs.com/zzxx-myblog/p/6481284.html
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/108645.html原文链接:https://javaforall.net
