# close / dataframe extention.R
# pajuan's picture
# Upload 3 files
# 9493ec9 verified
# Dataframe extensions
# --------
#
#
# Count how often each value of one column occurs and return the counts
# spread into a single wide row (one output column per distinct value).
#
# Dataframe: object indexed with `[[counted]]` to obtain the values to count.
# counted:   name (or index) of the column whose values are tabulated.
# normalize: when TRUE, each count is reported as a percentage of the total,
#            rounded to 3 decimals; when FALSE, raw counts are reported.
#
# Returns the result of tidyr::pivot_wider() applied to the value/count table.
Dataframe.count_over_factor <- function(
    Dataframe,
    counted,
    normalize = FALSE) {
  # Express a proportion (0..1) as a percentage rounded to 3 decimals.
  pr_as_pretty_rate <- function(x) {
    round(x * 100, 3)
  }

  tally <- table(Dataframe[[counted]])
  counts <- data.frame(
    value = as.character(names(tally)),
    count = as.numeric(tally)
  )

  if (normalize) {
    # Reuse the local helper instead of repeating the rounding formula, and
    # avoid relying on an unqualified `mutate` being attached.
    counts[["count"]] <- pr_as_pretty_rate(
      counts[["count"]] / sum(counts[["count"]])
    )
  }

  tidyr::pivot_wider(
    counts,
    names_from = value,
    values_from = count
  )
}
#
# Reorder the columns of a data frame: the columns named in `order` come
# first (in that order), followed by every remaining column in its original
# position. Names in `order` that are not columns of `Dataframe` are ignored.
#
# Dataframe: data frame to reorder. The default calls
#            db.contrataciones_normalizadas(), which is defined outside this
#            block.
# order:     character vector of column names to move to the front.
#
# Returns `Dataframe` with the same columns in the new order. The result is
# always a data frame, even when only one column remains.
Dataframe.order <- function(
    Dataframe = db.contrataciones_normalizadas(),
    order = c("tipo_contrato", "salario")) {
  available <- names(Dataframe)
  arranged <- intersect(c(order, setdiff(available, order)), available)
  # drop = FALSE: without it a single-column data.frame collapses to a vector.
  Dataframe[, arranged, drop = FALSE]
}
#
#
# Replace every missing value (NA) in a data frame with `mapped_to`.
#
# Dataframe: data frame to clean. The default calls
#            db.contrataciones_normalizadas(), which is defined outside this
#            block.
# mapped_to: replacement value written into each NA cell (default 0).
#
# Returns the data frame with the NA cells overwritten.
Dataframe.map_NAS <- function(
    Dataframe = db.contrataciones_normalizadas(),
    mapped_to = 0) {
  missing_cells <- is.na(Dataframe)
  Dataframe[missing_cells] <- mapped_to
  Dataframe
}
#
#Dataframe.map_NAS() %>% View()
#
#
# Lower-case a character vector and transliterate it from UTF-8 to ASCII.
# Characters that cannot be converted are replaced by "" (sub = "").
#
# names: character vector to standardize.
#
# Returns the converted character vector.
basic_standardization <- function(names) {
  lowered <- tolower(names)
  iconv(lowered, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
}
#
# Turn arbitrary names into simplified identifiers: each name is tokenized
# with text2vec::word_tokenizer(), its tokens are joined with "_", dots are
# replaced by "_", the result is cut to at most 25 characters, and the whole
# vector is finally passed through basic_standardization().
#
# names: character vector of candidate names.
#
# Returns the value of basic_standardization() for the rebuilt names.
stata_valid_names <- function(names) {
  token_groups <- text2vec::word_tokenizer(names)
  rebuilt <- sapply(token_groups, function(tokens) {
    joined <- paste(tokens, collapse = "_")
    without_dots <- stringr::str_replace_all(joined, "[.]", "_")
    # Keep only the leading 25 characters.
    substr(without_dots, 0, 25)
  })
  basic_standardization(rebuilt)
}
# Prefix with "v_" every string that starts with a digit; other strings are
# returned unchanged.
#
# str_seq: character vector of names.
#          NOTE(review): the default refers to a symbol `n` that is not
#          defined in this block; it is kept only so the signature stays
#          unchanged - confirm whether any caller relies on it.
#
# Returns a character vector of the same length. As before, elements are
# named after the input strings (or keep the input's own names). NA elements
# stay NA instead of raising an error, and empty input yields an empty
# character vector instead of an empty list.
str_seq.replace_initial_numbers <- function(str_seq = n) {
  fixed <- as.character(str_seq)
  starts_with_digit <- !is.na(fixed) & grepl("^[0-9]", fixed)
  fixed[starts_with_digit] <- paste0("v_", fixed[starts_with_digit])

  # Reproduce the element names the previous sapply()-based version produced.
  if (is.null(names(str_seq))) {
    names(fixed) <- str_seq
  } else {
    names(fixed) <- names(str_seq)
  }
  fixed
}
#
#str_seq.replace_initial_numbers()
#
# stata_valid_names(c("Variable_invalida1",
# "Variable//.,invalida2",
# "Variableinvalida3_con_nombre_super_largo",
# "Variable_con_acentuación",
# "nomnbre.invalido"
# ))
#
# Rename the columns of a data frame by passing the current names through
# stata_valid_names() and then str_seq.replace_initial_numbers().
#
# Dataframe: data frame to rename. The default calls
#            db.contrataciones_normalizadas(), which is defined outside this
#            block.
#
# Returns the data frame with the rewritten column names.
Dataframe.apply_valid_names_for_stata <- function(
    Dataframe = db.contrataciones_normalizadas()) {
  cleaned <- stata_valid_names(names(Dataframe))
  names(Dataframe) <- str_seq.replace_initial_numbers(cleaned)
  Dataframe
}
#
# Dataframe.apply_valid_names_for_stata() %>% names()
#
# Keep only the rows that have no missing value in any of the mandatory
# columns.
#
# df:              data frame to filter.
# mandatory_model: column names (or indices) that must be non-NA in a row
#                  for the row to be kept.
#
# Returns `df` restricted to the complete rows, as produced by dplyr::filter().
Dataframe.mandatory_model <- function(
    df,
    mandatory_model) {
  # Select with `[` rather than dplyr::select(mandatory_model): a bare
  # external vector inside select() is ambiguous (a column that happens to be
  # called "mandatory_model" would be selected instead) and is deprecated.
  required <- df[mandatory_model]
  is_complete <- rowSums(is.na(required)) == 0
  # `!!` injects the precomputed mask so that no column of `df` can shadow it.
  dplyr::filter(df, !!is_complete)
}
#
#
#
# Convert every column of a data frame to character.
#
# Dataframe: data frame (or list of columns) to convert.
#
# Returns a data.frame rebuilt with as.data.frame() from the converted columns.
Dataframe.vars_as_character <- function(Dataframe) {
  as_text <- lapply(Dataframe, as.character)
  as.data.frame(as_text)
}
#
#Dataframe.vars_as_character()
#
# Round-trip a data frame through a text file: it is written with
# write.table() and read back with read.table(), declaring the given encoding
# for the strings that are read.
#
# Dataframe:    data frame to re-encode.
# FileEncoding: value passed as `encoding` to read.table() (default "UTF-8").
#
# Returns the data frame as read back from disk.
Dataframe.reencode <- function(
    Dataframe,
    FileEncoding = "UTF-8") {
  # Use a private temporary file and always remove it, instead of leaving
  # "tempfile.txt" behind in the working directory.
  scratch_path <- tempfile(fileext = ".txt")
  on.exit(unlink(scratch_path), add = TRUE)

  utils::write.table(Dataframe, scratch_path)
  # Honour the FileEncoding argument (previously ignored in favour of a
  # hard-coded "UTF-8", which is still the default).
  utils::read.table(scratch_path, encoding = FileEncoding)
}
#
#Dataframe.reencode()
#
#
# Add (or overwrite) a column holding the row-wise sum of a set of columns.
#
# Dataframe:  data frame to extend.
# aggregated: names (or indices) of the columns to add up, row by row.
# label:      name of the column that receives the sums.
# Na.rm:      passed as `na.rm` to the sum; when TRUE, NA values are skipped.
#
# Returns `Dataframe` with the `label` column set to the row sums (stored as
# doubles). No scratch column is used, so an existing column named "temp_var"
# is no longer overwritten and dropped.
Dataframe.aggregate <- function(
    Dataframe,
    aggregated,
    label,
    Na.rm = TRUE) {
  # `[` selects exactly the requested columns; a bare external vector inside
  # dplyr::select() would be ambiguous with a column named "aggregated".
  addends <- Dataframe[aggregated]
  Dataframe[[label]] <- unname(rowSums(addends, na.rm = Na.rm))
  Dataframe
}
#
# Dataframe.aggregate(
# Dataframe=db.indicadores_por_oferente(),
# aggregated=c(
# "aprendizaje",
# "obra",
# "otro",
# "prest_de_servicios",
# "temporal",
# "termino_fijo",
# "termino_indefinido"),
# label="cualquier_contrato"
# ) %>% View()
#
# Guard clause for pipelines: stop with an error unless every mandatory
# variable is a column name of the data frame; otherwise pass the data frame
# through untouched.
#
# Dataframe:      data frame to check.
# mandatory_vars: character vector of column names that must be present.
#
# Returns `Dataframe` unchanged when all names are found.
Dataframe.complain_for_vars <- function(
    Dataframe,
    mandatory_vars) {
  found <- mandatory_vars %in% names(Dataframe)
  if (!all(found)) {
    stop("some mandatory vars not found in Dataframe")
  }
  Dataframe
}
#
# Dataframe.complain_for_vars(
# data.frame(),
# mandatory_vars="unexistent"
# )
#
# Dataframe.complain_for_vars(
# data.frame("existent1"="", "existent2"=""),
# mandatory_vars="existent1"
# )
#
# Append a totals row to a data frame.
#
# Dataframe:            data frame to extend.
# i_am_not_totalizable: column names that must not be summed; they are
#                       compared against each column name with %in%.
# Na.rm:                passed as `na.rm` to sum().
# group_name_col:       when numeric, index of the column whose cell in the
#                       new last row is overwritten with "Totales".
#
# Returns the data frame with one extra row.
#
# NOTE(review): a second function named Dataframe.totalize is defined further
# down in this file; when the whole file is sourced, that later definition
# replaces this one.
Dataframe.totalize=function(
Dataframe,
i_am_not_totalizable=NaN,
Na.rm=TRUE,
group_name_col=1
){
#
an=
# Build one value per column and bind it below the data as a new row.
rbind(Dataframe,
sapply(names(Dataframe),function(data_col){
ifelse(
!(data_col %in% i_am_not_totalizable),
# Totalizable column: sum of its values after conversion with as.numeric().
sum(Dataframe[[data_col]] %>% as.numeric(), na.rm=Na.rm),
# Excluded column: placeholder text instead of a total.
"-"
)}))
# Label the totals row only when the label column is given as a number.
if(is.numeric( group_name_col)){
an[nrow(an),group_name_col]="Totales"
}
an
}
#
#Dataframe.totalize()
#
# Prepend a fixed prefix to every column name of a data frame.
#
# Dataframe: data frame to rename.
# prefix:    text placed in front of each column name (default "var_").
#
# Returns the data frame with the prefixed names.
Dataframe.prefix <- function(
    Dataframe,
    prefix = "var_") {
  prefixed <- paste0(prefix, names(Dataframe))
  stats::setNames(Dataframe, prefixed)
}
#
# Replace the column names of a data frame (pipe-friendly wrapper around
# `names<-`).
#
# Dataframe: data frame to rename.
# new_names: the new names, in column order.
#
# Returns the renamed data frame.
Dataframe.new_names <- function(
    Dataframe,
    new_names) {
  stats::setNames(Dataframe, new_names)
}
#
# Tabulate the values of one column as a long data frame: one row per
# distinct value, with the value (as character) in the first column and its
# frequency in the second.
#
# Dataframe: object indexed with `[[counted]]` to obtain the values to count.
# counted:   name (or index) of the column to tabulate.
#
# Returns the data frame built by as.data.frame() from the table.
Dataframe.count_values <- function(
    Dataframe,
    counted) {
  frequencies <- as.data.frame(table(Dataframe[[counted]]))
  # Store the counted values as character.
  frequencies[[1]] <- as.character(frequencies[[1]])
  frequencies
}
#
# Append a row of values to a data frame (pipe-friendly wrapper around
# rbind()).
#
# Dataframe:     data frame to extend.
# row_of_values: the values to append; they are handed to rbind() as is.
#
# Returns the result of rbind(Dataframe, row_of_values).
Dataframe.insert <- function(
    Dataframe,
    row_of_values) {
  extended <- rbind(Dataframe, row_of_values)
  extended
}
# Binarize a table: cells greater than or equal to the threshold become 1,
# cells below it become 0. NA cells are left as NA.
#
# Dataframe: data frame (or matrix) of comparable values.
# treshold:  cut-off value (default 1).
#
# Returns the table with every non-NA cell replaced by 0 or 1.
Dataframe.apply_treshold <- function(
    Dataframe,
    treshold = 1) {
  # Compute both masks from the ORIGINAL values before writing anything.
  # Writing the 1s first and testing afterwards would re-classify those 1s
  # as "below" whenever treshold > 1, zeroing the whole table.
  at_or_above <- Dataframe >= treshold
  below <- Dataframe < treshold
  Dataframe[at_or_above] <- 1
  Dataframe[below] <- 0
  Dataframe
}
#
# Add a `primary_key` column built from several columns and keep only the
# first row for each distinct key.
#
# Dataframe:              data frame to key.
# primary_key_components: names of the columns whose values make up the key.
#
# The key starts as "" and each component is appended with "+" as separator,
# so keys look like "+<c1>+<c2>" (the leading "+" of the original
# construction is kept).
#
# Returns the data frame with the extra `primary_key` column, without the
# rows whose key duplicates an earlier row.
Dataframe.alter_table_Dataframe_add_primary_key <- function(
    Dataframe,
    primary_key_components) {
  if (is.character(primary_key_components)) {
    unknown <- setdiff(primary_key_components, names(Dataframe))
    if (length(unknown) > 0) {
      # A missing column would otherwise be silently skipped by paste().
      stop(
        "primary key components not found in Dataframe: ",
        paste(unknown, collapse = ", "),
        call. = FALSE
      )
    }
  }

  key <- rep("", nrow(Dataframe))
  for (component in primary_key_components) {
    key <- paste(key, Dataframe[[component]], sep = "+")
  }

  keyed <- Dataframe
  keyed$primary_key <- key
  # The previous version ended with `%>% return(new_df)`, i.e.
  # return(<filtered>, new_df), which R rejects at run time; return the
  # de-duplicated table as the plain last expression instead.
  keyed[!duplicated(key), , drop = FALSE]
}
#
# Keep only the columns whose name matches a regular expression.
#
# Dataframe:       data frame to subset.
# selecting_regex: pattern handed to grep() and matched against the column
#                  names.
#
# Returns the result of dplyr::select() on the matching column names.
Dataframe.select_on_regex <- function(
    Dataframe,
    selecting_regex) {
  dplyr::select(
    Dataframe,
    grep(pattern = selecting_regex, x = names(Dataframe), value = TRUE)
  )
}
#
# Keep the rows whose `fecha` column lies strictly between two bounds.
#
# Dataframe:  data frame with a `fecha` column.
# lower_date: exclusive lower bound (rows need fecha > lower_date).
# upper_date: exclusive upper bound (rows need fecha < upper_date).
#
# Returns the filtered data frame.
Dataframe.delimite_dates <- function(
    Dataframe,
    lower_date,
    upper_date) {
  after_lower <- dplyr::filter(Dataframe, fecha > lower_date)
  dplyr::filter(after_lower, fecha < upper_date)
}
#
# regex associated behavior
# --------
#
# Lower-case a character vector and transliterate it to ASCII with iconv().
#
# names: character vector to standardize.
#
# Returns the converted character vector.
Textual_feature.basic_standardization <- function(names) {
  lowered <- tolower(names)
  iconv(lowered, to = "ASCII//TRANSLIT")
}
#
# Name every element of a vector after its own value.
#
# x: vector to name.
#
# Returns `x` with names(x) set to `x`.
i_am_my_name <- function(x) {
  stats::setNames(x, x)
}
#
# Add one 0/1 indicator column per regex feature, telling whether the
# standardized text of each row matches that feature's pattern.
#
# Dataframe:       data frame holding the text column.
# text_source:     name of the column with the text to inspect.
# features:        named collection of patterns; each name becomes part of an
#                  output column name. By default the distinct values of the
#                  text column are used, each named after itself.
# standardization: function applied to the text before matching.
# name_prefix:     text placed in front of each feature name to build the
#                  output column name (no default; must be supplied).
#
# Returns the data frame with the indicator columns added. The scratch column
# "z_textual_source" used during the computation is removed before returning.
Dataframe.expand_regex_features <- function(
    Dataframe,
    text_source,
    features = table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
    standardization = Textual_feature.basic_standardization,
    name_prefix) {
  working <- Dataframe %>%
    dplyr::mutate(z_textual_source = standardization(.[[text_source]]))

  for (feature_name in names(features)) {
    indicator_col <- sprintf("%s%s", name_prefix, feature_name)
    matched <- stringr::str_detect(
      working[["z_textual_source"]],
      pattern = features[[feature_name]]
    )
    working[[indicator_col]] <- ifelse(matched, 1, 0)
  }

  dplyr::select(working, -"z_textual_source")
}
#
#
# Add one column per regex feature holding the text that
# stringr::str_extract() pulls out of each row's standardized text for that
# feature's pattern.
#
# Dataframe:       data frame holding the text column.
# text_source:     name of the column with the text to inspect.
# features:        named collection of patterns; each name becomes part of an
#                  output column name. By default the distinct values of the
#                  text column are used, each named after itself.
# standardization: function applied to the text before extraction.
# name_prefix:     text placed in front of each feature name to build the
#                  output column name (no default; must be supplied).
#
# Returns the data frame with the extracted-text columns added. The scratch
# column "z_textual_source" used during the computation is removed before
# returning.
Dataframe.extract_regex_fields <- function(
    Dataframe,
    text_source,
    features = table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
    standardization = Textual_feature.basic_standardization,
    name_prefix) {
  working <- Dataframe %>%
    dplyr::mutate(z_textual_source = standardization(.[[text_source]]))

  for (feature_name in names(features)) {
    field_col <- sprintf("%s%s", name_prefix, feature_name)
    working[[field_col]] <- stringr::str_extract(
      working[["z_textual_source"]],
      pattern = features[[feature_name]]
    )
  }

  dplyr::select(working, -"z_textual_source")
}
#
# Extract a single column from a data frame (pipe-friendly wrapper around
# `[[`).
#
# Dataframe: data frame (or list) to read from.
# fetched:   name (or index) of the element to extract.
#
# Returns Dataframe[[fetched]].
Dataframe.fetch <- function(
    Dataframe,
    fetched) {
  Dataframe[[fetched]]
}
#
# Append a summary row to a data frame: `lambda` is applied to every column
# and the results are bound below the data as one extra row.
#
# Dataframe: data frame to extend.
# lambda:    function called once per column with that column as its argument
#            (presumably returning a single value per column - confirm with
#            callers).
#
# Returns the result of dplyr::bind_rows() on the data and the summary row.
#
# NOTE(review): this file defines another function with this same name
# earlier on; when the whole file is sourced, this definition is the one that
# remains.
Dataframe.totalize <- function(
    Dataframe,
    lambda) {
  # check.names = FALSE keeps column names exactly as they are. Otherwise
  # non-syntactic names (e.g. "my var") get rewritten and bind_rows() would
  # put the summary values into new, misaligned columns.
  summary_row <- as.data.frame(
    lapply(Dataframe, lambda),
    check.names = FALSE
  )
  dplyr::bind_rows(Dataframe, summary_row)
}