Python Data Analysis: A Practical Guide (Part 3)

Plotting

Preparation

Fix the display of Chinese characters and the minus sign:

```python
plt.rcParams['font.sans-serif'] = ['SimHei']   # use the SimHei font for Chinese labels
plt.rcParams['axes.unicode_minus'] = False     # keep the minus sign from rendering as a box
sns.set(font='SimHei', font_scale=0.8)         # make Seaborn use the same Chinese font
```

Set the background style:

```python
plt.style.use('classic')
plt.rc("figure", facecolor="white")   # drop the grey figure border
```

Box plots

An example that draws the training set and the test set side by side:

```python
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(16, 12), ncols=2)
ax1 = sns.boxplot(x="Embarked", y="Fare", hue="Pclass", data=train, ax=ax[0])
ax2 = sns.boxplot(x="Embarked", y="Fare", hue="Pclass", data=test, ax=ax[1])
ax1.set_title("Training Set", fontsize=18)
ax2.set_title("Test Set", fontsize=18)
fig.show()
```

Exploded pie chart

```python
churn_value = data['cvr_group_high'].value_counts()
labels = data['cvr_group_high'].value_counts().index

plt.figure(figsize=(7, 7))
plt.pie(churn_value,
        labels=['一般客户', '高价值客户'],
        colors=["#75bbfd", "#00ffff"],
        explode=(0.05, 0),          # pull the first wedge out slightly
        autopct='%1.1f%%',
        shadow=False)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title("高价值客户占比23.4%")
# plt.savefig('pie.png', dpi=300)
```

Correlation heatmap

```python
mask = np.zeros_like(data.corr(), dtype=bool)   # np.bool is removed in newer NumPy; plain bool works
# mask[np.triu_indices_from(mask)] = True

plt.subplots(figsize=(15, 12))
sns.heatmap(data.corr(),
            annot=True,
            # mask=mask,
            cmap='RdBu',            # replace "RdBu" with "RdBu_r" to reverse the colour bar
            linewidths=.9,
            linecolor='gray',
            fmt='.2g',
            center=0,
            square=True)
plt.title("Correlations Among Features", y=1.03, fontsize=20, pad=40)
plt.savefig('cor.png', dpi=300)
plt.show()
```
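The commented-out mask lines above hide the redundant upper triangle of the symmetric correlation matrix. A minimal sketch of that masked variant, assuming `data` is an all-numeric DataFrame as in the snippet above:

```python
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

corr = data.corr()                         # data is assumed to contain only numeric columns
mask = np.zeros_like(corr, dtype=bool)     # start with nothing hidden
mask[np.triu_indices_from(mask)] = True    # hide the upper triangle (it mirrors the lower one)

plt.subplots(figsize=(15, 12))
sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', center=0, square=True)
plt.title("Correlations Among Features (lower triangle only)", fontsize=20)
plt.show()
```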
Kernel density estimate (KDE)

```python
fig = plt.figure(figsize=(15, 8))
# two equivalent ways of selecting the column are shown below; use whichever you prefer
ax = sns.kdeplot(data.client[data.cvr_group_high == 0], color='gray', shade=True, label='low')
ax = sns.kdeplot(data.loc[data['cvr_group_high'] == 1, 'client'], color='g', shade=True, label='high')
plt.title('client - high vs low', fontsize=25, pad=40)
plt.ylabel("Frequency of cvr", fontsize=15, labelpad=20)
plt.xlabel("Client", fontsize=15, labelpad=20)

# replace the numeric codes on the x axis with readable channel names
labels = ['H5', 'android', 'ios', 'pc', 'wap']
plt.xticks(sorted(data.client.unique()), labels)
plt.legend()
```

Model training

Import the modules:

```python
import warnings
warnings.filterwarnings("ignore")    # silence warnings

import pandas as pd
import numpy as np
from pyforest import *
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier           # random forest
from sklearn.svm import SVC, LinearSVC                        # support vector machines
from sklearn.linear_model import LogisticRegression           # logistic regression
from sklearn.neighbors import KNeighborsClassifier            # KNN
from sklearn.cluster import KMeans                            # K-Means clustering
from sklearn.naive_bayes import GaussianNB                    # naive Bayes
from sklearn.tree import DecisionTreeClassifier               # decision tree
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score  # classification report
from sklearn.metrics import confusion_matrix                  # confusion matrix
from sklearn.metrics import silhouette_score                  # silhouette score (evaluates K-Means clustering)
from sklearn.model_selection import GridSearchCV              # grid-search cross-validation
from sklearn.metrics import make_scorer
from sklearn.ensemble import VotingClassifier                 # voting ensemble
```

Helper functions and the list of candidate classifiers:

```python
import math

def plot_predictions(test, predicted):
    # shift the x axis so it starts at 1
    x = np.arange(0, len(test)) + 1
    # my_x_ticks = np.arange(1, 14, 1)
    # plt.xticks(my_x_ticks)
    plt.plot(x, test, label='Real')
    plt.plot(x, predicted, color='darkorange', linestyle='--', label='Predicted')
    # plt.xlabel('month')
    plt.ylabel('count')
    plt.legend()

def mse_loss(y_true, y_pred):
    return np.sum(np.power(y_true - y_pred, 2)) / y_true.shape[0] / 2

def return_rmse(test, predicted):
    rmse = math.sqrt(mse_loss(test, predicted))
    return rmse

Classifiers = [
    ["Random Forest", RandomForestClassifier()],
    ["Support Vector Machine", SVC()],
    ["LogisticRegression", LogisticRegression()],
    ["KNN", KNeighborsClassifier(n_neighbors=5)],
    ["Naive Bayes", GaussianNB()],
    ["Decision Tree", DecisionTreeClassifier()],
    ["AdaBoostClassifier", AdaBoostClassifier()],
    ["GradientBoostingClassifier", GradientBoostingClassifier()],
    ["XGB", XGBClassifier()],
]
```

Set up the training set:

```python
X = train.drop(['目标客户编号', '品牌类型', '购买意愿'], axis=1)
# X = train.drop(['目标客户编号', '品牌类型'], axis=1)
t = X
headers = X.columns
X = X.astype(float)
y = train["购买意愿"]

# the loop below expects X_train/X_test/y_train/y_test; an 80/20 split is assumed here
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```

Train the models:

```python
import warnings
warnings.filterwarnings('ignore')

Classify_result = []
names = []
prediction = []
for name, classifier in Classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    f1score = f1_score(y_test, y_pred, average='macro')
    mse = return_rmse(y_test, y_pred)
    class_eva = pd.DataFrame([recall, precision, f1score, mse])
    Classify_result.append(class_eva)
    names.append(pd.Series(name))
    y_pred = pd.Series(y_pred)
    prediction.append(y_pred)
    plot_predictions(y_test, y_pred)
    # plt.savefig('seven1.png', dpi=300)
    plt.show()
```
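The loop only accumulates the per-model metrics in `Classify_result` and `names` without ever combining them. A small sketch of how they could be assembled into one comparison table (the row labels simply follow the order in which the metrics were appended above):

```python
# assemble the collected metrics into a single comparison table
result = pd.concat(Classify_result, axis=1)
result.columns = pd.concat(names)                          # one column per model name
result.index = ['recall', 'precision', 'f1score', 'rmse']  # order matches the appends above
print(result.T.sort_values('f1score', ascending=False))
```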

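`VotingClassifier` is imported above but never used. As a rough sketch, a soft-voting ensemble over a few of the candidate models could be trained on the same split; the choice of estimators and their parameters here are assumptions, not part of the original workflow:

```python
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# soft voting averages predicted probabilities, so every estimator must support predict_proba
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=1000)),
    ],
    voting='soft',
)
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print("voting f1 (macro):", f1_score(y_test, y_pred, average='macro'))
```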