# Dataframe extensions
# --------
#
library(magrittr)  # the %>% pipe is used throughout this file
#
Dataframe.count_over_factor=function(
  Dataframe,
  counted,
  normalize=FALSE
){
  #
  # map Dataframe to a one-row Dataframe with the count (or percentage, if
  # normalize=TRUE) of each value of the `counted` column
  #
  # pr_as_pretty_rate=function(x){ round(x*100,3) }
  #
  table(Dataframe[[counted]]) %>%
    { data.frame( value=names(.) %>% as.character(), count=as.numeric(.)) } %>%
    { if(normalize){
        dplyr::mutate(., count= round(.[["count"]]/sum(.[["count"]])*100, 3) )
      } else { . } } %>%
    tidyr::pivot_wider( names_from = value, values_from = count )
}
#
Dataframe.order=function(
  Dataframe=db.contrataciones_normalizadas(),
  order= c("tipo_contrato", "salario")
){
  #
  # map Dataframe to a Dataframe with the columns listed in `order` moved to the front
  #
  Dataframe %>%
    { .[, intersect( c(order, setdiff(names(.), order) ), names(.)) ] }
}
#
#
Dataframe.map_NAS=function(
  Dataframe=db.contrataciones_normalizadas(),
  mapped_to=0
){
  #
  # map Dataframe to a Dataframe with every NA replaced by `mapped_to`
  #
  Dataframe %>%
    { .[is.na(.)]=mapped_to; . }
}
#
#Dataframe.map_NAS() %>% View()
#
#
basic_standardization=function( names ){
  #
  tolower(names) %>%
    iconv(., from="UTF-8", sub="", to="ASCII//TRANSLIT")
}
#
stata_valid_names=function(names){
  # tokenize, join with "_", replace dots, truncate and strip accents so the
  # result is a Stata-safe variable name
  text2vec::word_tokenizer(names) %>%
    sapply(., function(some_name_units){
      s=paste(some_name_units, collapse="_")
      s=stringr::str_replace_all(s, "[.]", "_")
      substr(s, 0, 25)
    }) %>%
    basic_standardization()
}

str_seq.replace_initial_numbers= function(
  str_seq=n
){
  # prefix "v_" to any name that starts with a digit
  str_seq %>%
    sapply( function(str){
      if( stringr::str_detect( str, "^[0-9]" )){
        str=paste( "v_", str, sep="")
      }
      str
    })
}
#
#str_seq.replace_initial_numbers()
#
# stata_valid_names(c("Variable_invalida1",
#                     "Variable//.,invalida2",
#                     "Variableinvalida3_con_nombre_super_largo",
#                     "Variable_con_acentuación",
#                     "nomnbre.invalido"
#                     ))
#
Dataframe.apply_valid_names_for_stata=function(
  Dataframe=db.contrataciones_normalizadas()
){
  names(Dataframe)=stata_valid_names(names(Dataframe)) %>%
    str_seq.replace_initial_numbers
  Dataframe
}
#
# Dataframe.apply_valid_names_for_stata() %>% names()
#
Dataframe.mandatory_model=function(
  df,
  mandatory_model
){
  # keep only the rows where every variable in `mandatory_model` is non-NA
  df %>%
    dplyr::filter(
      df %>%
        dplyr::select( mandatory_model ) %>%
        apply( MARGIN=1, function(row_data){ all(!is.na(row_data)) })
    )
}
#
#
#
Dataframe.vars_as_character=function(
  Dataframe
){
  Dataframe %>%
    lapply(function(data_col){ as.character(data_col) }) %>%
    as.data.frame()
}
#
#Dataframe.vars_as_character()
#
Dataframe.reencode=function(
  Dataframe,
  FileEncoding="UTF-8"
){
  # round-trip through a temporary text file to force re-encoding
  Dataframe %>% write.table("tempfile.txt")
  an= read.table("tempfile.txt", encoding=FileEncoding)
  return(an)
}
#
#Dataframe.reencode()
#
#
Dataframe.aggregate=function(
  Dataframe,
  aggregated,
  label,
  Na.rm=TRUE
){
  # add a column `label` with the row-wise sum of the `aggregated` columns
  Dataframe %>%
    { dplyr::mutate( .,
        temp_var= {
          Dataframe %>%
            dplyr::select(aggregated) %>%
            apply(X=., MARGIN=1, FUN=function(row_data){
              sum(row_data, na.rm = Na.rm)
            })
        }
      )
    } %>%
    { .[[label]]=.[["temp_var"]]; . } %>%
    dplyr::select(-"temp_var")
}
#
# Dataframe.aggregate(
#   Dataframe=db.indicadores_por_oferente(),
#   aggregated=c(
#     "aprendizaje",
#     "obra",
#     "otro",
#     "prest_de_servicios",
#     "temporal",
#     "termino_fijo",
#     "termino_indefinido"),
#   label="cualquier_contrato"
# ) %>% View()
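#
# Illustrative sketch (toy data invented here, not from the original project):
# the same Dataframe.aggregate call on a small data frame, so its behaviour
# can be checked without db.indicadores_por_oferente().
#
# toy= data.frame( obra=c(1, NA, 2), temporal=c(0, 1, 1) )
# Dataframe.aggregate( toy,
#                      aggregated=c("obra", "temporal"),
#                      label="cualquier_contrato" )
# # expected: a new cualquier_contrato column with the row sums 1, 1, 3
#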
#
Dataframe.complain_for_vars=function(
  Dataframe,
  mandatory_vars
){
  # stop with an informative message when any mandatory variable is missing
  stopifnot(
    "some mandatory vars not found in Dataframe"=
      all( (mandatory_vars %in% names(Dataframe)) )
  )
  #
  Dataframe
}
#
# Dataframe.complain_for_vars(
#   data.frame(),
#   mandatory_vars="unexistent"
# )
#
# Dataframe.complain_for_vars(
#   data.frame("existent1"="", "existent2"=""),
#   mandatory_vars="existent1"
# )
#
Dataframe.totalize=function(
  Dataframe,
  i_am_not_totalizable=NaN,
  Na.rm=TRUE,
  group_name_col=1
){
  # append a "Totales" row with the column sums, skipping the columns listed
  # in `i_am_not_totalizable`
  an= rbind(Dataframe,
            sapply(names(Dataframe), function(data_col){
              ifelse(
                !(data_col %in% i_am_not_totalizable),
                sum(Dataframe[[data_col]] %>% as.numeric(), na.rm=Na.rm),
                "-"
              )}))
  if(is.numeric( group_name_col )){
    an[nrow(an), group_name_col]="Totales"
  }
  an
}
#
#Dataframe.totalize()
#
Dataframe.prefix=function(
  Dataframe,
  prefix="var_"
){
  names(Dataframe)=paste(prefix, names(Dataframe), sep="")
  Dataframe
}
#
Dataframe.new_names=function(
  Dataframe,
  new_names
){
  names(Dataframe)=new_names; Dataframe
}
#
Dataframe.count_values=function(
  Dataframe,
  counted
){
  new_df= table(Dataframe[[counted]]) %>% as.data.frame()
  new_df[[1]]=as.character(new_df[[1]])
  new_df
}
#
Dataframe.insert=function(
  Dataframe,
  row_of_values
){
  rbind(Dataframe, row_of_values)
}

Dataframe.apply_treshold=function(
  Dataframe,
  treshold=1
){
  # binarize: values >= treshold become 1, values below it become 0
  Dataframe[Dataframe>=treshold]=1
  Dataframe[Dataframe<treshold]=0
  Dataframe
}
#
# NOTE: the name and opening lines of the next function were garbled in the
# source; the name, signature and primary_key initialization below are a
# reconstruction (assumed), only the mutate loop and the final filter are
# original. It drops rows that are duplicated over the `components` columns.
Dataframe.drop_duplicates=function(
  Dataframe,
  components
){
  new_df= Dataframe %>% dplyr::mutate( primary_key="" )
  for( component in components ){
    new_df= new_df %>%
      dplyr::mutate( primary_key=paste(primary_key, .[[component]], sep="+" ) )
  }
  new_df %>%
    dplyr::filter(!duplicated(primary_key)) %>%
    return()
}
#
Dataframe.select_on_regex=function(
  Dataframe,
  selecting_regex
){
  Dataframe %>%
    dplyr::select(
      grep(names(.), pattern=selecting_regex, value=TRUE)
    ) %>%
    return()
}
#
Dataframe.delimite_dates=function(
  Dataframe,
  lower_date,
  upper_date
){
  # keep rows whose `fecha` falls strictly between the two dates
  Dataframe %>%
    dplyr::filter( fecha > lower_date ) %>%
    dplyr::filter( fecha < upper_date ) %>%
    return()
}
#
#
# regex associated behaviour
# --------
#
Textual_feature.basic_standardization=function( names ){
  #
  tolower(names) %>%
    iconv(to="ASCII//TRANSLIT")
}
#
i_am_my_name=function(x){ names(x)=x; x }
#
Dataframe.expand_regex_features=function(
  Dataframe,
  text_source,
  features=table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
  standardization=Textual_feature.basic_standardization,
  name_prefix
){
  # add one 0/1 indicator column per named regex in `features`, matched
  # against the standardized text of `text_source`
  state_df= Dataframe %>%
    dplyr::mutate( z_textual_source=standardization(.[[text_source]]) )
  #
  for (feature_name in names(features)){
    state_df[[sprintf("%s%s", name_prefix, feature_name)]]=
      ifelse(
        stringr::str_detect(state_df$z_textual_source,
                            pattern=features[[feature_name]]), 1, 0)
  }
  state_df %>% dplyr::select(-"z_textual_source")
}
#
#
Dataframe.extract_regex_fields=function(
  Dataframe,
  text_source,
  features=table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
  standardization=Textual_feature.basic_standardization,
  name_prefix
){
  # add one column per named regex in `features`, holding the text extracted
  # from the standardized `text_source`
  state_df= Dataframe %>%
    dplyr::mutate( z_textual_source=standardization(.[[text_source]]) )
  #
  for (feature_name in names(features)){
    state_df[[sprintf("%s%s", name_prefix, feature_name)]]=
      stringr::str_extract(state_df$z_textual_source,
                           pattern=features[[feature_name]])
  }
  state_df %>% dplyr::select(-"z_textual_source")
}
#
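#
# Illustrative sketch (data and regexes invented here, not from the original
# project): Dataframe.expand_regex_features tags free text with one 0/1
# indicator column per named regex.
#
# toy= data.frame( descripcion=c("Contrato a TERMINO FIJO",
#                                "prestacion de servicios") )
# Dataframe.expand_regex_features(
#   toy,
#   text_source="descripcion",
#   features=c( fijo="termino fijo", servicios="prestacion de servicios" ),
#   name_prefix="es_"
# )
# # expected: new columns es_fijo = c(1, 0) and es_servicios = c(0, 1)
#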
Dataframe.fetch=function(
  Dataframe,
  fetched
){
  Dataframe %>% { Dataframe[[fetched]] }
}
#
# NOTE: redefines the Dataframe.totalize declared above; this lambda-based
# version is the one that remains after sourcing the file.
Dataframe.totalize=function(
  Dataframe,
  lambda
){
  list(
    Dataframe,
    lapply(Dataframe, function(col){ lambda(col) }) %>% as.data.frame()
  ) %>%
    dplyr::bind_rows()
}
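#
# Illustrative sketch (toy data invented here, not from the original project):
# the lambda-based Dataframe.totalize appends one summary row computed with an
# arbitrary function, here sum().
#
# toy= data.frame( a=c(1, 2), b=c(10, 20) )
# Dataframe.totalize( toy, lambda=function(col){ sum(col) } )
# # expected: a third row with a=3, b=30
#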