|
|
|
|
|
|
|
|
|
|
|
Dataframe.count_over_factor=function(
|
|
Dataframe,
|
|
counted,
|
|
normalize=FALSE
|
|
){
|
|
|
|
|
|
|
|
|
|
|
|
pr_as_pretty_rate=function(x){
|
|
round(x*100,3)
|
|
}
|
|
|
|
table(Dataframe[[counted]]) %>%
|
|
{
|
|
data.frame(
|
|
value=names(.) %>% as.character(),
|
|
count=as.numeric(.))
|
|
} %>%
|
|
{
|
|
if(normalize){
|
|
|
|
mutate(., count=
|
|
round(.[["count"]]/sum(.[["count"]])*100, 3)
|
|
)
|
|
|
|
} else {
|
|
.
|
|
}
|
|
|
|
}%>%
|
|
tidyr::pivot_wider(
|
|
names_from = value,
|
|
values_from=count
|
|
)
|
|
}
|
|
|
|
Dataframe.order=function(
|
|
Dataframe=db.contrataciones_normalizadas(),
|
|
order= c("tipo_contrato", "salario")
|
|
){
|
|
|
|
|
|
|
|
|
|
Dataframe %>%
|
|
{
|
|
.[,
|
|
intersect( c(order, setdiff(names(.), order) ), names(.))
|
|
]
|
|
}
|
|
}
|
|
|
|
|
|
Dataframe.map_NAS=function(
|
|
Dataframe=db.contrataciones_normalizadas(),
|
|
mapped_to=0
|
|
){
|
|
|
|
|
|
|
|
|
|
Dataframe %>%
|
|
{
|
|
.[is.na(.)
|
|
]=mapped_to;.
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
basic_standardization=function(
|
|
names
|
|
){
|
|
|
|
tolower(names) %>%
|
|
|
|
iconv(.,from="UTF-8", sub="", to="ASCII//TRANSLIT")
|
|
|
|
}
|
|
|
|
stata_valid_names=function(names){
|
|
text2vec::word_tokenizer(names) %>%
|
|
sapply(., function(some_name_units){
|
|
s=paste(some_name_units, collapse="_")
|
|
s=stringr::str_replace_all(s, "[.]", "_")
|
|
substr(s, 0, 25)
|
|
}) %>% basic_standardization()
|
|
}
|
|
|
|
str_seq.replace_initial_numbers=
|
|
function(
|
|
str_seq=n
|
|
){
|
|
str_seq %>%
|
|
sapply(
|
|
function(str){
|
|
if(
|
|
stringr::str_detect(
|
|
str, "^[0-9]"
|
|
)){
|
|
str=paste( "v_",str, sep="")
|
|
};str
|
|
|
|
}
|
|
)
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataframe.apply_valid_names_for_stata=function(
|
|
Dataframe=db.contrataciones_normalizadas()
|
|
){
|
|
names(Dataframe)=stata_valid_names(names(Dataframe)) %>%
|
|
str_seq.replace_initial_numbers
|
|
Dataframe
|
|
}
|
|
|
|
|
|
|
|
Dataframe.mandatory_model=function(
|
|
df,
|
|
mandatory_model
|
|
){
|
|
df %>%
|
|
dplyr::filter(
|
|
df %>%
|
|
dplyr::select( mandatory_model) %>%
|
|
apply( MARGIN=1,
|
|
function(row_data){
|
|
all(!is.na(row_data))
|
|
})
|
|
)
|
|
}
|
|
|
|
|
|
|
|
Dataframe.vars_as_character=function(
|
|
Dataframe
|
|
){
|
|
Dataframe %>%
|
|
lapply(function(data_col){
|
|
as.character(data_col)
|
|
}) %>%
|
|
as.data.frame()
|
|
}
|
|
|
|
|
|
|
|
Dataframe.reencode=function(
|
|
Dataframe,
|
|
FileEncoding="UTF-8"
|
|
){
|
|
|
|
Dataframe %>% write.table("tempfile.txt")
|
|
an= read.table("tempfile.txt", encoding="UTF-8")
|
|
return(an)
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataframe.aggregate=function(
|
|
Dataframe,
|
|
aggregated,
|
|
label,
|
|
Na.rm=TRUE
|
|
){
|
|
|
|
|
|
Dataframe %>%
|
|
{
|
|
dplyr::mutate(
|
|
.,
|
|
temp_var=
|
|
{
|
|
Dataframe %>%
|
|
dplyr::select(aggregated) %>%
|
|
apply(X=.,
|
|
MARGIN=1,
|
|
FUN=function(row_data){
|
|
sum(row_data, na.rm = Na.rm)
|
|
})
|
|
}
|
|
)
|
|
} %>%
|
|
{
|
|
.[[label]]=.[["temp_var"]];.
|
|
|
|
} %>%
|
|
dplyr::select(-"temp_var")
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataframe.complain_for_vars=function(
|
|
Dataframe,
|
|
mandatory_vars
|
|
){
|
|
stopifnot(
|
|
"some mandatory vars not found in Dataframe"=
|
|
all( (mandatory_vars %in% names(Dataframe)) )
|
|
)
|
|
|
|
Dataframe
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataframe.totalize=function(
|
|
Dataframe,
|
|
i_am_not_totalizable=NaN,
|
|
Na.rm=TRUE,
|
|
group_name_col=1
|
|
){
|
|
|
|
an=
|
|
rbind(Dataframe,
|
|
sapply(names(Dataframe),function(data_col){
|
|
ifelse(
|
|
!(data_col %in% i_am_not_totalizable),
|
|
sum(Dataframe[[data_col]] %>% as.numeric(), na.rm=Na.rm),
|
|
"-"
|
|
)}))
|
|
if(is.numeric( group_name_col)){
|
|
an[nrow(an),group_name_col]="Totales"
|
|
}
|
|
an
|
|
}
|
|
|
|
|
|
|
|
|
|
Dataframe.prefix=function(
|
|
Dataframe,
|
|
prefix="var_"
|
|
){
|
|
names(Dataframe)=paste(prefix, names(Dataframe), sep="")
|
|
Dataframe
|
|
}
|
|
|
|
|
|
Dataframe.new_names=function(
|
|
Dataframe,
|
|
new_names
|
|
){
|
|
names(Dataframe)=new_names;Dataframe
|
|
}
|
|
|
|
|
|
Dataframe.count_values=function(
|
|
Dataframe,
|
|
counted
|
|
){
|
|
new_df=
|
|
table(Dataframe[[counted]]) %>%
|
|
as.data.frame()
|
|
new_df[[1]]=as.character(new_df[[1]])
|
|
new_df
|
|
}
|
|
|
|
|
|
Dataframe.insert=function(
|
|
Dataframe,
|
|
row_of_values
|
|
){
|
|
rbind(Dataframe, row_of_values)
|
|
}
|
|
|
|
Dataframe.apply_treshold=function(
|
|
Dataframe,
|
|
treshold=1
|
|
){
|
|
Dataframe[Dataframe>=treshold]=1
|
|
Dataframe[Dataframe<treshold]=0
|
|
return(Dataframe)
|
|
}
|
|
|
|
|
|
Dataframe.alter_table_Dataframe_add_primary_key=function(
|
|
Dataframe,
|
|
primary_key_components
|
|
){
|
|
new_df=Dataframe
|
|
new_df$primary_key=""
|
|
|
|
for (component in primary_key_components){
|
|
new_df=
|
|
new_df %>%
|
|
dplyr::mutate(
|
|
primary_key=paste(primary_key, .[[component]], sep="+" )
|
|
)
|
|
}
|
|
|
|
new_df %>%
|
|
dplyr::filter(!duplicated(primary_key)) %>%
|
|
return(new_df)
|
|
}
|
|
|
|
|
|
Dataframe.select_on_regex=function(
|
|
Dataframe,
|
|
selecting_regex
|
|
){
|
|
Dataframe %>%
|
|
dplyr::select(
|
|
grep(names(.), pattern=selecting_regex, value=TRUE)
|
|
) %>%
|
|
return()
|
|
}
|
|
|
|
|
|
|
|
Dataframe.delimite_dates=function(
|
|
Dataframe,
|
|
lower_date,
|
|
upper_date
|
|
){
|
|
Dataframe %>%
|
|
|
|
dplyr::filter(
|
|
fecha > lower_date
|
|
) %>%
|
|
dplyr::filter(
|
|
fecha < upper_date
|
|
) %>%
|
|
|
|
return()
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Textual_feature.basic_standardization=function(
|
|
names
|
|
){
|
|
|
|
tolower(names) %>%
|
|
iconv(to="ASCII//TRANSLIT")
|
|
|
|
}
|
|
|
|
i_am_my_name=function(x){
|
|
names(x)=x;x
|
|
}
|
|
|
|
Dataframe.expand_regex_features=function(
|
|
Dataframe,
|
|
text_source,
|
|
features=table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
|
|
standardization=Textual_feature.basic_standardization,
|
|
name_prefix
|
|
|
|
){
|
|
|
|
state_df=
|
|
Dataframe %>%
|
|
dplyr::mutate(
|
|
z_textual_source=standardization(.[[text_source]])
|
|
)
|
|
|
|
for (feature_name in names(features)){
|
|
state_df[[sprintf("%s%s", name_prefix, feature_name)]]=
|
|
ifelse(
|
|
stringr::str_detect(state_df$z_textual_source, pattern=features[[feature_name]]),1,0)
|
|
}
|
|
state_df %>%
|
|
dplyr::select(-"z_textual_source")
|
|
}
|
|
|
|
|
|
Dataframe.extract_regex_fields=function(
|
|
Dataframe,
|
|
text_source,
|
|
features=table(Dataframe[[text_source]]) %>% names() %>% i_am_my_name(),
|
|
standardization=Textual_feature.basic_standardization,
|
|
name_prefix
|
|
){
|
|
|
|
state_df=
|
|
Dataframe %>%
|
|
dplyr::mutate(
|
|
z_textual_source=standardization(.[[text_source]])
|
|
)
|
|
|
|
for (feature_name in names(features)){
|
|
state_df[[sprintf("%s%s", name_prefix, feature_name)]]=
|
|
stringr::str_extract(state_df$z_textual_source, pattern=features[[feature_name]])
|
|
}
|
|
state_df %>%
|
|
dplyr::select(-"z_textual_source")
|
|
}
|
|
|
|
|
|
Dataframe.fetch=function(
|
|
Dataframe,
|
|
fetched
|
|
){
|
|
Dataframe %>%
|
|
{
|
|
Dataframe[[fetched]]
|
|
}
|
|
}
|
|
|
|
|
|
Dataframe.totalize=function(
|
|
Dataframe,
|
|
lambda
|
|
){
|
|
list(
|
|
Dataframe,
|
|
lapply(Dataframe, function(col){
|
|
lambda(col)
|
|
}) %>% as.data.frame()
|
|
) %>%
|
|
bind_rows()
|
|
} |