RNote113---smbinning分箱并输出到Excel

网友投稿 627 2022-10-08

RNote113---smbinning分箱并输出到Excel

RNote113---smbinning分箱并输出到Excel

用IV做单变量分析,需要关注每个bin的覆盖率和逾期率。把smbinning的结果和图输出到指定的Excel中。

统计分析

# ------------------------------***统计分析函数***------------------------------## parameter : x-feature,y-label,data-dataframe,## 参数:## ---- x:变量名 ## ---- y:标签名 ## ---- data:数据框名## ---- type:## ---------descriptive_statistics:统计数据,包括缺失值个数等## ---------fillnull:缺失值填补## ---------fillpara:缺失值填补的参数## -----------------newname:填补之后的新特征## -----------------inplace:是否在原数据框增加这一列## -----------------fillvalue:缺失值填补项## ---------confusion_matrix:输出混淆矩阵## ---------ispercent:是否输出%格式的univariate_analyze_f <- function(x,y,data,ispercent = TRUE, type = "descriptive_statistics", fillpara = list(newname = paste0(x, "_new"), inplace = FALSE,fillvalue = -1)) { ## 加载需要的包 tryCatch( expr = {library(gmodels)}, error = function(e) {print("You Need Install 'gmodels' Packages First");return()} ) tryCatch( expr = {library(reshape2)}, error = function(e){print("You Need Install 'reshape2' Packages First");return()} ) ## 缺失值填充的配置参数 newname <- fillpara$newname inplace <- fillpara$inplace fillvalue <- fillpara$fillvalue if (type == "descriptive_statistics") { #make.names = NA 行名自动化,即1,2,3 descriptive_data <- as.data.frame.array(summary(data[x]), make.names = NA) return(descriptive_data) } else if (type == 'fillnull') { if (sum(is.na(data[x])) > 0) { # 有缺失值 if (inplace == TRUE) { data[newname] <- lapply(data[x], function(x) {ifelse(is.na(x), fillvalue, x)})[[1]] return(data) } else{ newdata <- data.frame(newname = lapply(data[x], function(x){ifelse(is.na(x), fillvalue, x)})[[1]]) colnames(newdata) <- newname return(newdata) } } else{print("No Miss Data")} } else if (type == 'confusion_matrix') { xx <- gmodels::CrossTable( data[, x], data[, y], prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, dnn = c("Feature", "Label") ) dt1 <- reshape2::dcast(as.data.frame(xx$t,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL) colnames(dt1) <- c("Feature/Lable",paste0("Freq_",colnames(dt1)[2:ncol(dt1)])) dt2 <- reshape2::dcast(as.data.frame(xx$prop.row,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL) colnames(dt2) <- c("Feature/Lable",paste0("Rate_",colnames(dt2)[2:ncol(dt2)])) dt <- merge(dt1,dt2,by = "Feature/Lable") dt3 <- reshape2::dcast(as.data.frame(xx$prop.tbl,stringsAsFactors = FALSE),x~y,fun.aggregate = NULL) dt["Row Total"] <- apply(dt1[2:ncol(dt1)], MARGIN = 1, FUN = sum) dt["Row_Rate"] <- apply(dt3[2:ncol(dt3)], MARGIN = 1, FUN = sum) ## 按照列求和,有点问题。Rate_0+Rate_1=0 Col_Total_Vector <- c(NA,apply(dt[2:ncol(dt)],MARGIN = 2, FUN = sum)) names(Col_Total_Vector)[1] <- "Feature/Lable" Col_Total_DF <- as.data.frame(matrix(Col_Total_Vector,1,length(Col_Total_Vector),byrow = T),stringsAsFactors = FALSE) colnames(Col_Total_DF) <- names(Col_Total_Vector) dt <- rbind(dt,Col_Total_DF);dt[nrow(dt),1] <- "Column Total" ## 处理 Rate_0、Rate_1 rate_len <- length(colnames(dt2)) for (i in colnames(dt2)[2:rate_len]){ dt[nrow(dt),i] <- dt[nrow(dt),paste0("Freq_",substr(i,6,nchar(i)))]/dt[nrow(dt),"Row Total"] } for(i in 2:ncol(dt)){dt[i] <- round(dt[i],4)} ## 是否输出百分号形式的数据 if(ispercent){ percent_col <- names(which(apply(dt[2:ncol(dt)],2,function(x){any(x>0&x<1)}))) for(i in percent_col ){dt[i] <- paste0(dt[,i]*100,"%")} return(dt) }else{return(dt)} } }

分箱

# ------------------------------***分箱函数***------------------------------## this function can return information value#### parameter seq_bin is like 10,20,30,so i can split by ","#### parameter maxcat is the maximum number of categories ## 参数:## ---- x:变量名,不要因子,不要因子,不要因子 ## ---- y:标签名 ## ---- data:数据框名## ---- seq_bin:分箱间隔## ---------auto;算法自动分箱## ---------"10,20":类似这种格式,程序处理为{(-inf,10](10,20](20,+inf]}三组## ---------type_level:重新分组,输出结果## ----maxcat:最大类别型变量个数,默认10,如果x的unique超过10,会报错## -----------------newname:填补之后的新特征univariate_iv_f <- function(x, y, data, seq_bin = "auto",maxcat = 10,type_level = list()) { tryCatch( expr = {library(smbinning)}, error = function(e) { print("You Need Install 'smbinning' Packages First") install.packages("smbinning");library(smbinning) } ) if (is.character(data[, x])|is.factor(data[, x])) { ## 类别型变量 if(length(type_level)==0){ ## 自动分箱,即不会合并类别 data[, x] <- as.factor(data[, x]) return(smbinning.factor(df = data,y = y, x = x, maxcat = maxcat)) }else{ ## 按照分类合并数据 for (i in names(type_level)){assign(i,type_level[[i]])} ## 循环完成赋值操作 ## 下面完成分类操作,又要用循环。简直丧心病狂 x_new <- c() for(j in data[, x] ){ isin <- unlist(lapply(names(type_level),function(x){j %in% get0(x)})) if(sum(isin)==0){ ## 如果不在list中,统一标识为"others" newname <- "others" }else if(sum(isin)>1){ ## 说明type_level设置有误,存在交叉项 print("Error: type_level have repetition ") return("Error: type_level have repetition ") }else{newname <- names(type_level)[which(isin)]} x_new <- c(x_new,newname) } x_colnew <- paste0(x,"_new") data[x_colnew] <- x_new data[, x_colnew] <- as.factor(data[, x_colnew]) return(smbinning.factor(df = data,y = y, x = x_colnew, maxcat = maxcat)) } } else if (is.numeric(data[, x])) { ## 数值型变量 if (seq_bin == "auto") { ## 算法自动分箱 return(smbinning( df = data,y = y, x = x,p = 0.05 )) } else{ ## 人为输入间隔点,分箱 cuts <- as.numeric(unlist(stringr::str_split(seq_bin, ","))) return(smbinning.custom(df = data,y = y,x = x,cuts = cuts)) } }}

画图

# ------------------------------***画图函数***------------------------------## ## 参数:## ---- x:变量名,不要因子,不要因子,不要因子,type = "iv",x,y有默认值不需要改 ## ---- y:标签名 ## ---- data:数据框名,要求传入的data是 smbinning 返回的结果## ---- type:两个类型## ----------distribution:连续值的密度分布## ----------iv:分箱之后的数据进行画图## ----------picname:特征,图片命名所用## ---- overduerate:平均逾期率,需要添加在图片中univariate_pic_f <- function(x="GoodRate",y="Cutpoint",picname = "Feature", data,type = "distribution",overduerate = 0.5) { tryCatch( expr = {library(ggplot2)}, error = function(e) { print("You Need Install 'ggplot2' Packages First") install.packages("ggplot2");library(ggplot2) } ) if (type == "distribution") { if(is.numeric(data[, x])){ # 如果是数值型,那么输出密度分布图 ## use ggplot2 generate distribution figure of the density ## this is just for continuous variable plotdata <- data[, c(x, y)];plotdata[, y] <- as.factor(plotdata[, y]) colnames(plotdata) <- c("Feature", "Label") library(ggplot2) pic <- ggplot2::ggplot(data = plotdata, aes(x = Feature,color = Label,fill = Label)) + geom_density(alpha = .3)+scale_fill_brewer(type = "seq", palette = "Greens") + scale_colour_manual(values = c("red", "yellow")) + labs(title = paste0(stringr::str_to_title(x)," Distribution Of Different Groups")) return(pic) }else{ return("Input Must Be Continuous Variable") } }else if(type == "iv"){ ## 输入的是分箱之后的结果,需要画出分组逾期率和覆盖率,并且添加平均逾期率 library(ggplot2) mytheme <- theme( plot.title = element_text( size = 15,hjust =0.5, vjust = 1,color="black", face = "bold"), #改变标题的位置、颜色、字体大小 panel.background=element_rect(fill="white", color="black"), panel.grid.major.y=element_line(color="black", linetype=2), panel.grid.minor.y=element_blank(), panel.grid.minor.x=element_blank() ) plotdata <- data[1:(nrow(data)-2),1:ncol(data)] pic <- ggplot(data = plotdata, aes(x = Cutpoint, y = GoodRate, group = 1)) + geom_bar(mapping = aes(y = PctRec), position = "dodge",stat = "identity", width = 0.4,fill = " orange")+# 设置bar的大小,填充色为绿色 geom_line(colour = "red",size = 1) + geom_point(colour = "red",size = 2,shape = 17) + geom_abline(slope = 0,intercept = overduerate,colour = "blue",size = 1) + # 添加水平线标识总体逾期率 geom_text(mapping = aes(label = GoodRate),size = 3,colour = 'black' ,vjust = -0.8,hjust = .5,position = position_dodge(0.9))+ labs(title = paste0(" Overdue Rate of ", picname," Bin"))+ scale_y_continuous(limits=c(0,1),breaks=seq(from=0,to=1,by=0.05))+mytheme return(pic) }}

保存图片

# ------------------------------***保存图片函数***------------------------------## ## 参数:## ---- pic:图片对象## ---- savepath:保存地址,默认目录,需要查验## ---- picname:图片名称save_pic_f <- function(pic,savepath = "./Picture",picname = "Rplot" ){ tryCatch( expr = {library(ggplot2)}, error = function(e) { print("You Need Install 'ggplot2' Packages First") install.packages("ggplot2");library(ggplot2) } ) ggsave( file = paste0(picname, ".png"), plot = pic,path = savepath, width = 5,height = 4,dpi = 600 )}

输出到Excel

# ------------------------------***Excel交互函数***------------------------------## ## 参数:## ---- wb_path:Excel: 存放目录## ---- wb_name:Excel: 名称## ---- create_sheet: 是否新建sheet## ---- sheet_name: 新建sheet名称or加载sheet名称## ---- pic_name: 需要保存的图片名称,默认png格式。此处无需后缀&路径## ---- pic_path: 图片的存放目录## ---- data: 需要保存的dataframe数据#### 测试方式####--1.载入已有数据,新建sheet,并且保存图片和数据# save2excel(wb_path = "./Docment/",# wb_name = "Statistics",# create_sheet = TRUE,# sheet_name = "test6",# pic_path = "./Picture/",# pic_name = "Rplot",data = data,# save_pic = TRUE,save_data = TRUE)save2excel <- function(wb_path = "./Document/", wb_name = "Statistics", create_sheet = TRUE, sheet_name = "Sheet", pic_path = "./Picture/", pic_name = "test",data, save_pic = TRUE,save_data = TRUE) { package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx') package_install <- sapply(X=package_need, FUN = require, character.only = TRUE) if(!all(package_install)){ ## 有package没有安装 print(paste0("U Must install ",package_need[which(!package_install)])) } wb_complete_path <- paste0(wb_path,wb_name,".xlsx") pic_complete_path <- paste0(pic_path,pic_name,".png") # 先加载Excel wb <- loadWorkbook(wb_complete_path) if(create_sheet){ # 新建Sheet usesheet <- createSheet(wb, sheetName=sheet_name) }else{ # 加载现有Sheet usesheet <- eval(parse(text = paste0("getSheets(wb)$",sheet_name))) } if(save_pic){ # 添加图片 addPicture( file = pic_complete_path, sheet = usesheet, # 图片保存的sheet startRow = 15,# 在sheet中的位置 startColumn = 2 ) } if(save_data){ ## 设置样式 #### cs1 标题行 cs1 <- CellStyle(wb) + Alignment(horizontal = "ALIGN_CENTER", vertical = "VERTICAL_CENTER") + # 对齐方式,水平&竖直居中 Border( color = "black", position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"), pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN")) + # 所有边框加框线&黑色框线 Font(wb, isItalic = TRUE, isBold = TRUE)+ # 字体加粗、Italic字体 Fill( foregroundColor = "cornflowerblue", backgroundColor = "cornflowerblue", pattern = "SOLID_FOREGROUND" ) #背景色为矢车菊蓝 #### 第一列样式 cs2 <- CellStyle(wb) + Border( color = "black", position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"), pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN") )+#黑色细框线 Font(wb, isItalic = FALSE, isBold = TRUE)+ Fill( foregroundColor = "lightblue", backgroundColor = "lightblue", pattern = "SOLID_FOREGROUND" )# 浅蓝色填充 #### 非首列样式 cs3 <- CellStyle(wb) + Border( color = "black", position = c("TOP", "RIGHT" , "LEFT", "BOTTOM"), pen = c("BORDER_THIN", "BORDER_THIN", "BORDER_THIN", "BORDER_THIN") )#黑色细框线 list_name <- paste0("`", 2:ncol(data),"`") colStyle_str <- paste("list(`1`=cs2,", paste0(list_name,"=","cs3",collapse = ","),")") colStyle <- eval(parse(text = colStyle_str)) addDataFrame( x = data,# 要保存的数据框 sheet = usesheet,# 保存的位置 col.names = TRUE,# 是否保留列名 row.names = FALSE,# 是否保留行名 startRow = 1,#数据保存的起始位置 startColumn = 1,#数据保存的起始位置 colStyle= colStyle,# 第二、三列蓝色 colnamesStyle = cs1,# 列名的格式 rownamesStyle = cs1,# 行名的格式 showNA = FALSE,# 空值是否展示,默认不展示,即保留为空白格 characterNA = "",# NA展示位空字符串和上面showNA参数有关系 byrow = FALSE ) } saveWorkbook(wb, wb_complete_path)}

新建Excel

# ------------------------------***创建Excel函数***------------------------------## ## 参数:## ---- create_wb 是否创建Excel## ---- create_sheet 是否创建新Sheet## ---- wb_path Excel存放目录## ---- wb_name Excel名称## ---- sheet_name Sheet名称create_excel_f <- function(create_wb = FALSE, create_sheet = TRUE, wb_path = "./Docment/", wb_name = "Statistics", sheet_name = "Sheet") { package_need <- c('tidyverse', 'rJava', 'xlsxjars', 'xlsx') package_install <- sapply(X=package_need, FUN = require, character.only = TRUE) if(!all(package_install)){ ## 有package没有安装 print(paste0("U Must install ",package_need[which(!package_install)])) } if(create_wb){ ##新建Excel save_path <- paste0(wb_path,wb_name,".xlsx") # 新建Excel和Sheet wb <- createWorkbook() if(create_sheet){ sheet1 <- createSheet(wb, sheet_name) } #保存Excel saveWorkbook(wb, save_path) } }

2018-07-13 于南京市建邺区新城科技园

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:微信小程序模块化开发框架(微信小程序模块化开发框架设计)
下一篇:Megalo- 基于 Vue 的小程序开发框架(megalocyte同义词)
相关文章

 发表评论

暂时没有评论,来抢沙发吧~