Python数据分析:实用向

文件处理
导包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

添加镜像
https://mirrors.tuna.tsinghua.edu.cn/
https://developer.aliyun.com/mirror/
http://mirrors.163.com/ubuntu/
https://mirrors.ustc.edu.cn/
http://mirrors.zju.edu.cn/
http://mirrors.sohu.com/
http://ftp.sjtu.edu.cn/
http://mirror.bjtu.edu.cn/
http://mirror.bjtu.edu.cn/
语法
其中 http 和 https 是可选的
! pip install xxx -i https://mirrors.tuna.tsinghua.edu.cn/
导入文件
excel
data=pd.read_excel(r"C:\Users\ranxi\Desktop\附录1 目标客户体验数据.xlsx", sheet_name='data')
data.head()
csv
data=pd.read_csv()
EDA报告
#生成报告
import pandas_profiling
data.profile_report()
#输出报告文件
pfr = pandas_profiling.ProfileReport(data)
pfr.to_file('report.html')
dataframe导出excel文件
data.to_excel('data.xlsx')
数据处理
数据筛选
分类均值展示
cvr_summary = data.groupby("cvr_group_high")
cvr_summary.mean().reset_index()
标签编码
print("client","--" ,data.client.unique())
from sklearn.preprocessing import LabelEncoder
data.client = LabelEncoder().fit_transform(data.client)
print("client","--" ,data.client.unique())
交叉比例表
pd.crosstab(data['invited_is'],data["cvr_group_high"],normalize=0)
计算分布比例
def percent_value_counts(df, feature):
    """This function takes in a dataframe and a column and finds the percentage of the value_counts"""
    percent = pd.DataFrame(round(df.loc[:,feature].value_counts(dropna=False, normalize=True)*100,2))
    ## creating a df with th
    total = pd.DataFrame(df.loc[:,feature].value_counts(dropna=False))
    ## concating percent and total dataframe
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis = 1)
percent_value_counts(data, "B7")
多列apply函数
with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)
卡方检验
#分组间确实是有显著性差异 , 频数比较的结论才有可信度,故需进行“卡方检验”
from scipy.stats import chi2_contingency
#统计分析 卡方检验
#自定义卡方检验函数
def KF(x):
    df1=pd.crosstab(data2['购买意愿'],data2[x])
    li1=list(df1.iloc[0,:])
    li2=list(df1.iloc[1,:])
    kf_data=np.array([li1,li2])
    kf=chi2_contingency(kf_data)
    if kf[1]<0.05:
        print('购买意愿 by {} 的卡方临界值是{:.2f},小于0.05,表明{}组间有显著性差异,可进行【交叉分析】'.format(x,kf[1],x),'\n')
    else:
        print('购买意愿 by {} 的卡方临界值是{:.2f} , 大于0.05,表明{}组间无显著性差异,不可进行交叉分析'.format(x,kf[1],x),'\n')
#对 kf_var进行卡方检验
print('kf_var的卡方检验结果如下:','\n')
print(list(map(KF, 
kf_var)))
条件筛选
specific=data[(data['a1']>100)|(data['a2']>100)|(data['a3']>100)|(data['a4']>100)|(data['a5']>100)|(data['a6']>100)|(data['a7']>100)|(data['a8']>100)]
specific
specific=data[(data['']>x)|&()]
data[data.Cabin=='N']
map函数分组
def hour_group_fun(hour):
    x = ''
    if 0<=hour<8:
        x=1
    elif 8<=hour<16:
        x=2
    else:
        x=3
    return x
## Applying function to the column.
police['hour_group'] =police['hour'].map(hour_group_fun)
apply多列赋值
with_N['B7'] = with_N.apply(lambda x: child_estimator(x['B6'], x['B5']), axis=1)
这是一个分布比例函数
def percent_value_counts(df, feature):
    """This function takes in a dataframe and a column and finds the percentage of the value_counts"""
    percent = pd.DataFrame(round(df.loc[:,feature].value_counts(dropna=False, normalize=True)*100,2))
    ## creating a df with th
    total = pd.DataFrame(df.loc[:,feature].value_counts(dropna=False))
    ## concating percent and total dataframe
    total.columns = ["Total"]
    percent.columns = ['Percent']
    return pd.concat([total, percent], axis = 1)
特征工程
时间数据处理
police['date'] = pd.to_datetime(police['接警日期'],errors='coerce')
police['year'] =police['date'].dt.year.fillna(0).astype("int")#转化提取年
police['month'] = police['date'].dt.month.fillna(0).astype("int")#转化提取月
police['day'] = police['date'].dt.day.fillna(0).astype("int")#转化提取天
police['dates'] = police['month'].map(str) + '-' + police['day'].map(str) #转化获取月-日
police['time'] = pd.to_datetime(police['接警时间点'],errors='coerce').dt.time
police['hour'] = pd.to_datetime(police['接警时间点'],errors='coerce').dt.hour.fillna(0).astype("int")#转化提取小时
SMOTE过抽样
from imblearn.over_sampling import SMOTE
model_smote=SMOTE()
X,y=model_smote.fit_resample(X,y)
X=pd.DataFrame(X,columns=t.columns)
#分拆数据集:训练集 和 测试集
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
print('过抽样数据特征:', X.shape,'训练数据特征:',X_train.shape,'测试数据特征:',X_test.shape)
print('过抽样后数据标签:', y.shape,'训练数据标签:',y_train.shape,'测试数据标签:',y_test.shape)

推荐阅读