python数据处理

文章目录dataframeexcel读写数据Pandas不覆盖现有sheet在Excel中写入数据Pandas读取Excel的不同sheet中的数据read_excel()openpyxlRF一个例子dataframe#文件(示例，data数据框并不存在)import pandas as pda=pd.DataFrame(data)a.head(6)a.describe()a.T...

sljwy

254人浏览 · 2019-10-14 00:12:31

sljwy · 2019-10-14 00:12:31 发布

文章目录

dataframe
excel读写数据
- Pandas不覆盖现有sheet在Excel中写入数据
- Pandas读取Excel的不同sheet中的数据
read_excel()
openpyxl
RF一个例子

dataframe

# Walkthrough of basic DataFrame operations.
# Illustrative only: the `data` object used below is not defined in this snippet.
import pandas as pd
a=pd.DataFrame(data)
a.head(6)  # first six rows
a.describe()  # summary statistics
a.T  # transpose
a.sort_index(axis=1,ascending=True)  # sort along axis 1 (the keyword is `ascending`)
a['x']  # select the column labelled 'x'
a[0:3]  # slice rows 0..2
a.loc[:,['a','b']]  # index by label
a.iloc[1:3,1:6]  # index by position
a.iloc[1:2]
a.iloc[[0,2,3],[1,2,7]]  # pick arbitrary rows and columns by position
a.loc[:,1:3]=7  # assign directly to overwrite the selected cells

excel读写数据

Pandas不覆盖现有sheet在Excel中写入数据

# Wrap a 2x3 array in a DataFrame and write it to the worksheet named 'A'
# of test_excel.xlsx.
A = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(data=A)
df.to_excel('test_excel.xlsx', sheet_name='A')

但是，如果需要把两个DataFrame数据写入Excel文件中的不同sheet中，使用这种方法就有问题了：

# Pitfall demonstration: two frames written with two separate path-based
# to_excel calls against the same file.
A = np.array([[1,2,3],[4,5,6]])
B = np.array([[10, 20, 30], [40, 50, 60]])

df1 = pd.DataFrame(A)
df2 = pd.DataFrame(B)
df1.to_excel('test_excel.xlsx',sheet_name='AAA')
# Same target path as the call above; the article reports that afterwards
# only the 'BBB' sheet remains in the file.
df2.to_excel('test_excel.xlsx',sheet_name='BBB')

执行以上程序之后，打开 “test_excel.xlsx” ，可以看到表格中只有名字为“BBB”的sheet保存下来了，而名字为“AAA”的sheet被覆盖掉了。
其实被覆盖的原因很好理解，程序在执行第二条写入语句的时候，默认以前的数据是没有用的，先清空这个Excel文件里的数据。
解决方法：
利用Pandas包中的ExcelWriter类创建一个公共的写入对象（句柄），把所有DataFrame都通过这个对象写入，这样写入新数据时会保留之前写入的sheet；等所有数据都写完之后再关闭这个句柄。示例如下：

# Write two frames into separate sheets of the same file through one shared
# ExcelWriter.
A = np.array([[1,2,3],[4,5,6]])
B = np.array([[10, 20, 30], [40, 50, 60]])

df1 = pd.DataFrame(A)
df2 = pd.DataFrame(B)
# The context manager closes (and thereby saves) the writer even if one of
# the to_excel calls raises, so no explicit writer.close() is needed.
with pd.ExcelWriter('test_excel.xlsx') as writer:
    df1.to_excel(writer,sheet_name='AAA')
    df2.to_excel(writer,sheet_name='BBB')

Pandas读取Excel的不同sheet中的数据

在读取有多个sheet的Excel时，如果不指定sheet名字，那么read_excel函数默认读取第一个sheet中的数据。例如，我们使用上面生成的“test_excel.xlsx”表格来进行测试：

# Read the workbook three ways and print each result under its own label.
d1 = pd.read_excel('test_excel.xlsx')  # default arguments (no sheet specified)
print("d1:\n",d1)
d2 = pd.read_excel('test_excel.xlsx',sheet_name = 'AAA')  # read the sheet named 'AAA'
print("d2:\n",d2)  # label now matches the variable being printed
d3 = pd.read_excel('test_excel.xlsx',sheet_name = 'BBB')  # read the sheet named 'BBB'
print("d3:\n",d3)

那么，如果想用一条代码读取Excel中所有sheet的数据，可以将“sheet_name”参数指定为None，这样会得到一个字典，字典的key就是sheet名，value就是对应sheet里的数据：

    # Default read, for comparison with the all-sheets read below.
    dd1 = pd.read_excel('test_excel.xlsx')
    print("dd1:\n",dd1)
    # sheet_name=None requests every sheet; it is passed by keyword because
    # newer pandas releases no longer accept it positionally.
    dd2 = pd.read_excel('test_excel.xlsx',sheet_name=None)
    print("dd2:\n",dd2)

read_excel()

# After reading, the first row of the raw data becomes the column labels and
# the second raw row becomes the first data row (header=0 below).
# NOTE(review): this is a reference listing of the read_excel parameters as
# recorded by the author; the accepted keywords may differ in the installed
# pandas version - verify before copying.
pd.read_excel(io, sheet_name=0, header=0, names=None, index_col=None, 
              usecols=None, squeeze=False,dtype=None, engine=None, 
              converters=None, true_values=None, false_values=None, 
              skiprows=None, nrows=None, na_values=None,
              parse_dates=False, 
              date_parser=None, thousands=None, comment=None, skipfooter=0, 
              convert_float=True, **kwds)

"""
io:excel的存储路径
sheet_name:要读取的工作表名称
header:用哪一行作列名
names:自定义最终的列名
index_col:用作索引的列
usecols:需要读取哪些列
converters:强行规定列数据类型
skiprows:跳过特定行
nrows:需要读取的行数
skipfooter:跳过末尾n行
"""

# Example values for the read_excel parameters listed above.
io=r'C:\Users\Administrator\Documents\WeChat Files\data'
sheet_name='中英文名都可以或者Sheet1' # Chinese or English sheet names both work, e.g. "Sheet1" (capital S); the default is 0 (i.e. Sheet1)
names=['a','b','c','d'] # custom column names
usecols=[0,1,2]
usecols='A:C,E' # the range includes column C
converters= {'排名':str,'场次':int}  # author's note: "everything is integer by default" - NOTE(review): pandas normally infers each column's dtype; confirm

openpyxl

## Create an xlsx workbook
from openpyxl import Workbook
wb = Workbook() ## instantiate; operations on the spreadsheet go through wb
ws = wb.active ## take the workbook's active worksheet

## Read an existing xlsx file
from openpyxl import load_workbook

# Reference signature as recorded by the author (kept as a comment because it
# is not a runnable call and originally ran before the import):
#   load_workbook(filename, read_only=False, use_iterators=False,
#                 keep_vba=False, guess_types=False,
#                 data_only=False)
# NOTE(review): use_iterators and guess_types are not accepted by current
# openpyxl releases - verify against the installed version.
wb = load_workbook('文件名.xlsx') ## loads the whole workbook; select a sheet with wb[name] or wb.active

##一个例子
def read_excel_xlsx(path, sheet_name):
    """Print every cell of one worksheet, one spreadsheet row per output line.

    Args:
        path: Path of the .xlsx file to open.
        sheet_name: Name of the worksheet to print.
    """
    # Imported locally so the function is self-contained and does not depend
    # on a module-level `import openpyxl`.
    from openpyxl import load_workbook

    workbook = load_workbook(path)
    # Index the workbook by sheet name; the author notes that
    # get_sheet_by_name is deprecated and should not be used.
    sheet = workbook[sheet_name]
    for row in sheet.rows:
        for cell in row:
            # Separate the values with a tab and stay on the same line.
            print(cell.value, "\t", end="")
        print()

## Create a new sheet
help(openpyxl.workbook.Workbook.create_sheet)
# NOTE(review): the next line reproduces the method signature for reference;
# it is not a runnable call as written (neither create_sheet nor self is
# defined at this level in the visible code).
create_sheet(self, title=None, index=None)

##一个例子
def save(data1,K,path):
    """Write data1 as floats down column 1 of a new first sheet and save.

    The workbook at `path` is opened, a sheet titled with the module-level
    `para_phase` value is inserted at index 0, and the file is saved back to
    the same path.

    Args:
        data1: Indexable sequence of values convertible to float.
        K: Not used by this function.
        path: Path of the existing .xlsx file to open and overwrite.
    """
    workbook = load_workbook(path)  # open the existing workbook
    sheet = workbook.create_sheet(title=para_phase, index=0)
    count = len(data1)
    for row in range(1, count + 1):
        # Spreadsheet rows are 1-based, the data sequence is 0-based.
        sheet.cell(row=row, column=1).value = float(data1[row - 1])
    workbook.save(path)  # required: nothing reaches the file until it is saved

RF一个例子

import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xlwt
import sys
import datetime
from openpyxl import load_workbook
from imblearn.over_sampling import SMOTE


# Experiment tag: phase1, phase2, phase3, ...; phase1 is the version with no
# features removed.
para_phase = "phase11"

# Class-balancing switch, stored as the string "TRUE" or "FALSE".
balance = "TRUE"


def save(data1,data2,data3,data4,data5,data6,data7,data8,path):
    """Write eight per-fold metric series into a new first sheet and save.

    The workbook at `path` is opened, a sheet titled 'model_<para_phase>' is
    inserted at index 0, and the file is saved back to the same path. Column A
    holds the row labels; fold j (0-based) is written to column j + 2.

    Args:
        data1, data2, data3, data4: Series written as int (rows TN, FP, FN, TP).
        data5, data6, data7, data8: Series written as float (rows
            'accuary rate', 'OOB Score', 'KS', 'AUC').
        path: Path of the existing .xlsx file to open and overwrite.
    """
    f = load_workbook(path)  # open the existing workbook
    sheet1 = f.create_sheet(title='model_%s'%(para_phase), index=0)

    # Row labels are loop-invariant: write them once, and also when there is
    # no fold data at all.
    labels = ('TN', 'FP', 'FN', 'TP', 'accuary rate', 'OOB Score', 'KS', 'AUC')
    for row, label in enumerate(labels, start=1):
        sheet1.cell(row=row, column=1).value = label

    # One (series, converter) pair per output row, in row order.
    rows = (
        (data1, int), (data2, int), (data3, int), (data4, int),
        (data5, float), (data6, float), (data7, float), (data8, float),
    )
    # The number of columns is driven by data1, as before.
    for j in range(len(data1)):
        for row, (series, convert) in enumerate(rows, start=1):
            sheet1.cell(row=row, column=j + 2).value = convert(series[j])

    f.save(path)



if __name__ == '__main__':
    # 10-fold evaluation of a random-forest model; the per-fold metrics are
    # collected and written to an Excel sheet by save().
    all_data = pd.read_excel('TRAIN_all_%s.xlsx'%(para_phase), sheet_name='Sheet1')

    train_array = all_data.values

    # Column 2 is used as the target, columns 3 onward as the features.
    train_data = train_array[:, 3:]
    train_re = train_array[:, 2]

    kf = KFold(n_splits=10,shuffle=True)

    predict_score = []
    oob_score = []
    KS = []
    AUC_A = []
    TN = []
    FP = []
    FN = []
    TP = []

    for train_index,test_index in kf.split(train_re):
        X_train,X_test = train_data[train_index],train_data[test_index]
        Y_train,Y_test = train_re[train_index],train_re[test_index]

        if balance == 'TRUE':
            # Oversample the training fold only; the test fold stays untouched.
            over_sample = SMOTE(random_state = 255)
            # fit_resample replaces the fit_sample method removed from
            # imbalanced-learn.
            over_sample_X,oversample_Y = over_sample.fit_resample(X_train,Y_train)
        else:
            over_sample_X = X_train
            oversample_Y = Y_train

        model = RandomForestRegressor(n_estimators=500,oob_score=True)
        re_fit = model.fit(over_sample_X,oversample_Y)
        re_pred = re_fit.predict(X_test)

        # Keep an independent copy of the raw scores for AUC; a plain
        # assignment would alias the array that is thresholded next.
        re_pred_A = re_pred.copy()
        # Binarise at 0.3: scores >= 0.3 become 1, the rest 0.
        re_pred = np.where(re_pred >= 0.3, 1.0, 0.0).tolist()
        Y_test = Y_test.tolist()

        tn1, fp1, fn1, tp1 = confusion_matrix(Y_test, re_pred).ravel()

        accuary = (float(tn1)+float(tp1))/(float(tn1)+float(fp1)+float(fn1)+float(tp1))
        oobscore = model.oob_score_
        # KS at this single threshold: true-positive rate minus false-positive rate.
        K_S = float(tp1)/(float(tp1)+float(fn1))-float(fp1)/(float(fp1)+float(tn1))
        AUC = roc_auc_score(Y_test, re_pred_A)

        print(accuary)
        print(oobscore)
        print(K_S)
        print(AUC)

        predict_score.append(accuary)
        oob_score.append(oobscore)
        KS.append(K_S)
        AUC_A.append(AUC)
        TN.append(tn1)
        FP.append(fp1)
        FN.append(fn1)
        TP.append(tp1)

    save(TN,FP,FN,TP,predict_score,oob_score,KS,AUC_A,'para_phase_%s.xlsx'%(balance))
    print ("success!")