C2MV committed
Commit c6d4a29 · verified · 1 Parent(s): ec889f0

Update app.py

Files changed (1)
  1. app.py +685 -782
app.py CHANGED
@@ -1,826 +1,729 @@
-import numpy as np
-import pandas as pd
-import statsmodels.formula.api as smf
-import statsmodels.api as sm
-import plotly.graph_objects as go
-from scipy.optimize import minimize
-import plotly.express as px
-from scipy.stats import t, f
 import gradio as gr
 import io
-import zipfile
-import tempfile
-from datetime import datetime
-
-class RSM_BoxBehnken:
-    def __init__(self, data, x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels):
-        """
-        Initialize the class with the Box-Behnken design data.
-        """
-        self.data = data.copy()
-        self.model = None
-        self.model_simplified = None
-        self.optimized_results = None
-        self.optimal_levels = None
-        self.all_figures = []  # List that stores the figures
-        self.x1_name = x1_name
-        self.x2_name = x2_name
-        self.x3_name = x3_name
-        self.y_name = y_name
-
-        # Original levels of the variables
-        self.x1_levels = x1_levels
-        self.x2_levels = x2_levels
-        self.x3_levels = x3_levels
-
-    def get_levels(self, variable_name):
-        """
-        Get the levels for a specific variable.
-        """
-        if variable_name == self.x1_name:
-            return self.x1_levels
-        elif variable_name == self.x2_name:
-            return self.x2_levels
-        elif variable_name == self.x3_name:
-            return self.x3_levels
-        else:
-            raise ValueError(f"Variable desconocida: {variable_name}")
-
-    def fit_model(self):
-        """
-        Fit the full second-order model to the data.
-        """
-        formula = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + {self.x3_name} + ' \
-                  f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2) + ' \
-                  f'{self.x1_name}:{self.x2_name} + {self.x1_name}:{self.x3_name} + {self.x2_name}:{self.x3_name}'
-        self.model = smf.ols(formula, data=self.data).fit()
-        print("Modelo Completo:")
-        print(self.model.summary())
-        return self.model, self.pareto_chart(self.model, "Pareto - Modelo Completo")
-
-    def fit_simplified_model(self):
-        """
-        Fit the second-order model to the data, dropping non-significant terms.
-        """
-        formula = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + ' \
-                  f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2)'
-        self.model_simplified = smf.ols(formula, data=self.data).fit()
-        print("\nModelo Simplificado:")
-        print(self.model_simplified.summary())
-        return self.model_simplified, self.pareto_chart(self.model_simplified, "Pareto - Modelo Simplificado")
-
-    def optimize(self, method='Nelder-Mead'):
-        """
-        Find the optimal factor levels that maximize the response, using the simplified model.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
-            return
-
-        def objective_function(x):
-            return -self.model_simplified.predict(pd.DataFrame({
-                self.x1_name: [x[0]],
-                self.x2_name: [x[1]],
-                self.x3_name: [x[2]]
-            })).values[0]
-
-        bounds = [(-1, 1), (-1, 1), (-1, 1)]
-        x0 = [0, 0, 0]
-
-        self.optimized_results = minimize(objective_function, x0, method=method, bounds=bounds)
-        self.optimal_levels = self.optimized_results.x
-
-        # Convert the optimal levels from coded to natural units
-        optimal_levels_natural = [
-            self.coded_to_natural(self.optimal_levels[0], self.x1_name),
-            self.coded_to_natural(self.optimal_levels[1], self.x2_name),
-            self.coded_to_natural(self.optimal_levels[2], self.x3_name)
-        ]
-        # Build the optimization table
-        optimization_table = pd.DataFrame({
-            'Variable': [self.x1_name, self.x2_name, self.x3_name],
-            'Nivel Óptimo (Natural)': optimal_levels_natural,
-            'Nivel Óptimo (Codificado)': self.optimal_levels
-        })
-
-        return optimization_table.round(3)  # Round to 3 decimals
-
-    def plot_rsm_individual(self, fixed_variable, fixed_level):
-        """
-        Generate an individual response-surface (RSM) plot for a specific configuration.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
-            return None
-
-        # Determine the varying variables and their natural levels
-        varying_variables = [var for var in [self.x1_name, self.x2_name, self.x3_name] if var != fixed_variable]
-
-        # Set the natural levels for the varying variables
-        x_natural_levels = self.get_levels(varying_variables[0])
-        y_natural_levels = self.get_levels(varying_variables[1])
-
-        # Create a grid of points for the varying variables (in natural units)
-        x_range_natural = np.linspace(x_natural_levels[0], x_natural_levels[-1], 100)
-        y_range_natural = np.linspace(y_natural_levels[0], y_natural_levels[-1], 100)
-        x_grid_natural, y_grid_natural = np.meshgrid(x_range_natural, y_range_natural)
-
-        # Convert the natural-variable grid to coded units
-        x_grid_coded = self.natural_to_coded(x_grid_natural, varying_variables[0])
-        y_grid_coded = self.natural_to_coded(y_grid_natural, varying_variables[1])
-
-        # Create a DataFrame for the prediction, with coded variables
-        prediction_data = pd.DataFrame({
-            varying_variables[0]: x_grid_coded.flatten(),
-            varying_variables[1]: y_grid_coded.flatten(),
-        })
-        prediction_data[fixed_variable] = self.natural_to_coded(fixed_level, fixed_variable)
-
-        # Compute the predicted values
-        z_pred = self.model_simplified.predict(prediction_data).values.reshape(x_grid_coded.shape)
-
-        # Filter by the fixed variable's level (in coded units)
-        fixed_level_coded = self.natural_to_coded(fixed_level, fixed_variable)
-        subset_data = self.data[np.isclose(self.data[fixed_variable], fixed_level_coded)]
-
-        # Filter by valid levels in the varying variables
-        valid_levels = [-1, 0, 1]
-        experiments_data = subset_data[
-            subset_data[varying_variables[0]].isin(valid_levels) &
-            subset_data[varying_variables[1]].isin(valid_levels)
         ]

-        # Convert the experiment coordinates to natural units
-        experiments_x_natural = experiments_data[varying_variables[0]].apply(lambda x: self.coded_to_natural(x, varying_variables[0]))
-        experiments_y_natural = experiments_data[varying_variables[1]].apply(lambda x: self.coded_to_natural(x, varying_variables[1]))
-
-        # Create the surface plot with natural variables on the axes and some transparency
-        fig = go.Figure(data=[go.Surface(z=z_pred, x=x_grid_natural, y=y_grid_natural, colorscale='Viridis', opacity=0.7, showscale=True)])
-
-        # --- Add a grid to the surface ---
-        # Lines in the x direction
-        for i in range(x_grid_natural.shape[0]):
-            fig.add_trace(go.Scatter3d(
-                x=x_grid_natural[i, :],
-                y=y_grid_natural[i, :],
-                z=z_pred[i, :],
-                mode='lines',
-                line=dict(color='gray', width=2),
-                showlegend=False,
-                hoverinfo='skip'
-            ))
-        # Lines in the y direction
-        for j in range(x_grid_natural.shape[1]):
-            fig.add_trace(go.Scatter3d(
-                x=x_grid_natural[:, j],
-                y=y_grid_natural[:, j],
-                z=z_pred[:, j],
-                mode='lines',
-                line=dict(color='gray', width=2),
-                showlegend=False,
-                hoverinfo='skip'
-            ))
-
-        # --- End of the grid addition ---
-
-        # Add the experiment points on the response surface, with distinct colors and labels
-        colors = px.colors.qualitative.Safe
-        point_labels = [f"{row[self.y_name]:.3f}" for _, row in experiments_data.iterrows()]
-
-        fig.add_trace(go.Scatter3d(
-            x=experiments_x_natural,
-            y=experiments_y_natural,
-            z=experiments_data[self.y_name].round(3),
-            mode='markers+text',
-            marker=dict(size=4, color=colors[:len(experiments_x_natural)]),
-            text=point_labels,
-            textposition='top center',
-            name='Experimentos'
-        ))
-
-        # Add axis labels and a title with the natural variables
-        fig.update_layout(
-            scene=dict(
-                xaxis_title=f"{varying_variables[0]} ({self.get_units(varying_variables[0])})",
-                yaxis_title=f"{varying_variables[1]} ({self.get_units(varying_variables[1])})",
-                zaxis_title=self.y_name,
-            ),
-            title=f"{self.y_name} vs {varying_variables[0]} y {varying_variables[1]}<br><sup>{fixed_variable} fijo en {fixed_level:.3f} ({self.get_units(fixed_variable)}) (Modelo Simplificado)</sup>",
-            height=800,
-            width=1000,
-            showlegend=True
-        )
-        return fig
-
-    def get_units(self, variable_name):
-        """
-        Define the units of the variables for the labels.
-        Customize this method as needed.
-        """
-        units = {
-            'Glucosa': 'g/L',
-            'Extracto_de_Levadura': 'g/L',
-            'Triptofano': 'g/L',
-            'AIA_ppm': 'ppm'
-        }
-        return units.get(variable_name, '')
-
-    def generate_all_plots(self):
-        """
-        Generate all the RSM plots, varying the fixed variable and its levels, using the simplified model.
-        Store the figures in self.all_figures.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
-            return
-
-        self.all_figures = []  # Reset the list of figures
-
-        # Natural levels to plot
-        levels_to_plot_natural = {
-            self.x1_name: self.x1_levels,
-            self.x2_name: self.x2_levels,
-            self.x3_name: self.x3_levels
         }

-        # Generate and store the individual plots
-        for fixed_variable in [self.x1_name, self.x2_name, self.x3_name]:
-            for level in levels_to_plot_natural[fixed_variable]:
-                fig = self.plot_rsm_individual(fixed_variable, level)
-                if fig is not None:
-                    self.all_figures.append(fig)
-
-    def coded_to_natural(self, coded_value, variable_name):
-        """Convert a coded value to its natural value."""
-        levels = self.get_levels(variable_name)
-        return levels[0] + (coded_value + 1) * (levels[-1] - levels[0]) / 2
-
-    def natural_to_coded(self, natural_value, variable_name):
-        """Convert a natural value to its coded value."""
-        levels = self.get_levels(variable_name)
-        return -1 + 2 * (natural_value - levels[0]) / (levels[-1] - levels[0])
-
-    def pareto_chart(self, model, title):
-        """
-        Generate a Pareto chart for the standardized effects of a model,
-        including the significance line.
-        """
-        # Compute the standardized effects
-        tvalues = model.tvalues[1:]  # Exclude the Intercept
-        abs_tvalues = np.abs(tvalues)
-        sorted_idx = np.argsort(abs_tvalues)[::-1]
-        sorted_tvalues = abs_tvalues[sorted_idx]
-        sorted_names = tvalues.index[sorted_idx]
-
-        # Compute the critical t value for the significance line
-        alpha = 0.05  # Significance level
-        dof = model.df_resid  # Residual degrees of freedom
-        t_critical = t.ppf(1 - alpha / 2, dof)
-
-        # Create the Pareto chart
-        fig = px.bar(
-            x=sorted_tvalues.round(3),
-            y=sorted_names,
-            orientation='h',
-            labels={'x': 'Efecto Estandarizado', 'y': 'Término'},
-            title=title
-        )
-        fig.update_yaxes(autorange="reversed")
-
-        # Add the significance line
-        fig.add_vline(x=t_critical, line_dash="dot",
-                      annotation_text=f"t crítico = {t_critical:.3f}",
-                      annotation_position="bottom right")
-
-        return fig
-
-    def get_simplified_equation(self):
-        """
-        Return the simplified model's equation as a string.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
            return None

-        coefficients = self.model_simplified.params
-        equation = f"{self.y_name} = {coefficients['Intercept']:.3f}"
-
-        for term, coef in coefficients.items():
-            if term != 'Intercept':
-                if term == f'{self.x1_name}':
-                    equation += f" + {coef:.3f}*{self.x1_name}"
-                elif term == f'{self.x2_name}':
-                    equation += f" + {coef:.3f}*{self.x2_name}"
-                elif term == f'{self.x3_name}':
-                    equation += f" + {coef:.3f}*{self.x3_name}"
-                elif term == f'I({self.x1_name} ** 2)':
-                    equation += f" + {coef:.3f}*{self.x1_name}^2"
-                elif term == f'I({self.x2_name} ** 2)':
-                    equation += f" + {coef:.3f}*{self.x2_name}^2"
-                elif term == f'I({self.x3_name} ** 2)':
-                    equation += f" + {coef:.3f}*{self.x3_name}^2"
-
-        return equation
-
-    def generate_prediction_table(self):
-        """
-        Generate a table with the actual, predicted, and residual values.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
            return None

-        self.data['Predicho'] = self.model_simplified.predict(self.data)
-        self.data['Residual'] = self.data[self.y_name] - self.data['Predicho']
-
-        return self.data[[self.y_name, 'Predicho', 'Residual']].round(3)
-
-    def calculate_contribution_percentage(self):
-        """
-        Compute each factor's percentage contribution to the variability of the response (AIA).
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
            return None

-        # ANOVA of the simplified model
-        anova_table = sm.stats.anova_lm(self.model_simplified, typ=2)
-
-        # Total sum of squares
-        ss_total = anova_table['sum_sq'].sum()
-
-        # Build the contribution table
-        contribution_table = pd.DataFrame({
-            'Factor': [],
-            'Suma de Cuadrados': [],
-            '% Contribución': []
-        })
-
-        # Compute the percentage contribution of each factor
-        for index, row in anova_table.iterrows():
-            if index != 'Residual':
-                factor_name = index
-                if factor_name == f'I({self.x1_name} ** 2)':
-                    factor_name = f'{self.x1_name}^2'
-                elif factor_name == f'I({self.x2_name} ** 2)':
-                    factor_name = f'{self.x2_name}^2'
-                elif factor_name == f'I({self.x3_name} ** 2)':
-                    factor_name = f'{self.x3_name}^2'
-
-                ss_factor = row['sum_sq']
-                contribution_percentage = (ss_factor / ss_total) * 100
-
-                contribution_table = pd.concat([contribution_table, pd.DataFrame({
-                    'Factor': [factor_name],
-                    'Suma de Cuadrados': [ss_factor],
-                    '% Contribución': [contribution_percentage]
-                })], ignore_index=True)
-
-        return contribution_table.round(3)
-
-    def calculate_detailed_anova(self):
-        """
-        Compute the detailed ANOVA table with the decomposition of the residual error.
-        """
-        if self.model_simplified is None:
-            print("Error: Ajusta el modelo simplificado primero.")
-            return None
-
-        # --- Detailed ANOVA ---
-        # 1. Fit a model with only the first-order and quadratic terms
-        formula_reduced = f'{self.y_name} ~ {self.x1_name} + {self.x2_name} + {self.x3_name} + ' \
-                          f'I({self.x1_name}**2) + I({self.x2_name}**2) + I({self.x3_name}**2)'
-        model_reduced = smf.ols(formula_reduced, data=self.data).fit()
-
-        # 2. ANOVA of the reduced model (to obtain the regression sum of squares)
-        anova_reduced = sm.stats.anova_lm(model_reduced, typ=2)
-
-        # 3. Total sum of squares
-        ss_total = np.sum((self.data[self.y_name] - self.data[self.y_name].mean())**2)
-
-        # 4. Total degrees of freedom
-        df_total = len(self.data) - 1
-
-        # 5. Regression sum of squares
-        ss_regression = anova_reduced['sum_sq'][:-1].sum()  # Sum everything except 'Residual'
-
-        # 6. Regression degrees of freedom
-        df_regression = len(anova_reduced) - 1
-
-        # 7. Residual error sum of squares
-        ss_residual = self.model_simplified.ssr
-        df_residual = self.model_simplified.df_resid
-
-        # 8. Pure error sum of squares (computed from the replicates)
-        replicas = self.data[self.data.duplicated(subset=[self.x1_name, self.x2_name, self.x3_name], keep=False)]
-        if not replicas.empty:
-            ss_pure_error = replicas.groupby([self.x1_name, self.x2_name, self.x3_name])[self.y_name].var().sum() * replicas.groupby([self.x1_name, self.x2_name, self.x3_name]).ngroups
-            df_pure_error = len(replicas) - replicas.groupby([self.x1_name, self.x2_name, self.x3_name]).ngroups
-        else:
-            ss_pure_error = np.nan
-            df_pure_error = np.nan
-
-        # 9. Lack-of-fit sum of squares
-        ss_lack_of_fit = ss_residual - ss_pure_error if not np.isnan(ss_pure_error) else np.nan
-        df_lack_of_fit = df_residual - df_pure_error if not np.isnan(df_pure_error) else np.nan
-
-        # 10. Mean squares
-        ms_regression = ss_regression / df_regression
-        ms_residual = ss_residual / df_residual
-        ms_lack_of_fit = ss_lack_of_fit / df_lack_of_fit if not np.isnan(ss_lack_of_fit) else np.nan
-        ms_pure_error = ss_pure_error / df_pure_error if not np.isnan(ss_pure_error) else np.nan
-
-        # 11. F statistic and p-value for the lack of fit
-        f_lack_of_fit = ms_lack_of_fit / ms_pure_error if not np.isnan(ms_lack_of_fit) else np.nan
-        p_lack_of_fit = 1 - f.cdf(f_lack_of_fit, df_lack_of_fit, df_pure_error) if not np.isnan(f_lack_of_fit) else np.nan
-
-        # 12. Build the detailed ANOVA table
-        detailed_anova_table = pd.DataFrame({
-            'Fuente de Variación': ['Regresión', 'Residual', 'Falta de Ajuste', 'Error Puro', 'Total'],
-            'Suma de Cuadrados': [ss_regression, ss_residual, ss_lack_of_fit, ss_pure_error, ss_total],
-            'Grados de Libertad': [df_regression, df_residual, df_lack_of_fit, df_pure_error, df_total],
-            'Cuadrado Medio': [ms_regression, ms_residual, ms_lack_of_fit, ms_pure_error, np.nan],
-            'F': [np.nan, np.nan, f_lack_of_fit, np.nan, np.nan],
-            'Valor p': [np.nan, np.nan, p_lack_of_fit, np.nan, np.nan]
-        })
-
-        # Compute the sum of squares and degrees of freedom for the curvature
-        ss_curvature = anova_reduced['sum_sq'][f'I({self.x1_name} ** 2)'] + anova_reduced['sum_sq'][f'I({self.x2_name} ** 2)'] + anova_reduced['sum_sq'][f'I({self.x3_name} ** 2)']
-        df_curvature = 3
-
-        # Append the curvature row to the ANOVA table
-        detailed_anova_table.loc[len(detailed_anova_table)] = ['Curvatura', ss_curvature, df_curvature, ss_curvature / df_curvature, np.nan, np.nan]
-
-        # Reorder the rows so that curvature appears right after regression
-        detailed_anova_table = detailed_anova_table.reindex([0, 5, 1, 2, 3, 4])
-
-        # Reset the index so it is consecutive
-        detailed_anova_table = detailed_anova_table.reset_index(drop=True)
-
-        return detailed_anova_table.round(3)
-
-    def get_all_tables(self):
-        """
-        Collect all the generated tables to be exported to Excel.
-        """
-        prediction_table = self.generate_prediction_table()
-        contribution_table = self.calculate_contribution_percentage()
-        detailed_anova_table = self.calculate_detailed_anova()
-
-        return {
-            'Predicciones': prediction_table,
-            '% Contribución': contribution_table,
-            'ANOVA Detallada': detailed_anova_table
-        }
-
-    def save_figures_to_zip(self):
-        """
-        Save all figures stored in self.all_figures to an in-memory ZIP file.
-        """
-        if not self.all_figures:
            return None

-        zip_buffer = io.BytesIO()
-        with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
-            for idx, fig in enumerate(self.all_figures, start=1):
-                img_bytes = fig.to_image(format="png")
-                zip_file.writestr(f'Grafico_{idx}.png', img_bytes)
-        zip_buffer.seek(0)
-
-        # Save to a temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
-            temp_file.write(zip_buffer.read())
-            temp_path = temp_file.name
-
-        return temp_path
-
-    def save_fig_to_bytes(self, fig):
-        """
-        Convert a Plotly figure to bytes in PNG format.
-        """
-        return fig.to_image(format="png")
-
-    def save_all_figures_png(self):
-        """
-        Save all figures to temporary PNG files and return their paths.
-        """
-        png_paths = []
-        for idx, fig in enumerate(self.all_figures, start=1):
-            img_bytes = fig.to_image(format="png")
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-                temp_file.write(img_bytes)
-                temp_path = temp_file.name
-            png_paths.append(temp_path)
-        return png_paths
-
-    def save_tables_to_excel(self):
-        """
-        Save all tables to an Excel file with multiple sheets and return the file path.
-        """
-        if 'rsm' not in globals():
            return None

-        tables = self.get_all_tables()
-        excel_buffer = io.BytesIO()
-        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
-            for sheet_name, table in tables.items():
-                table.to_excel(writer, sheet_name=sheet_name, index=False)
-        excel_buffer.seek(0)
-        excel_bytes = excel_buffer.read()
-
-        # Save to a temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
-            temp_file.write(excel_bytes)
-            temp_path = temp_file.name
-
-        return temp_path
-
-# --- Functions for the Gradio interface ---
-
-def load_data(x1_name, x2_name, x3_name, y_name, x1_levels_str, x2_levels_str, x3_levels_str, data_str):
-    """
-    Load the Box-Behnken design data from the text boxes and create the RSM_BoxBehnken instance.
-    """
-    try:
-        # Convert the levels to lists of numbers
-        x1_levels = [float(x.strip()) for x in x1_levels_str.split(',')]
-        x2_levels = [float(x.strip()) for x in x2_levels_str.split(',')]
-        x3_levels = [float(x.strip()) for x in x3_levels_str.split(',')]
-
-        # Create a DataFrame from the data string
-        data_list = [row.split(',') for row in data_str.strip().split('\n')]
-        column_names = ['Exp.', x1_name, x2_name, x3_name, y_name]
-        data = pd.DataFrame(data_list, columns=column_names)
-        data = data.apply(pd.to_numeric, errors='coerce')  # Convert to numeric
-
-        # Validate that the DataFrame has the correct columns
-        if not all(col in data.columns for col in column_names):
-            raise ValueError("El formato de los datos no es correcto.")
-
-        # Create the RSM_BoxBehnken instance
-        global rsm
-        rsm = RSM_BoxBehnken(data, x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels)
-
-        return data.round(3), x1_name, x2_name, x3_name, y_name, x1_levels, x2_levels, x3_levels, gr.update(visible=True)
-
-    except Exception as e:
-        # Show an error message
-        error_message = f"Error al cargar los datos: {str(e)}"
-        print(error_message)
-        return None, "", "", "", "", [], [], [], gr.update(visible=False)
-
-def fit_and_optimize_model():
-    if 'rsm' not in globals():
-        return [None]*10
-
-    # Fit the models and optimize
-    model_completo, pareto_completo = rsm.fit_model()
-    model_simplificado, pareto_simplificado = rsm.fit_simplified_model()
-    optimization_table = rsm.optimize()
-    equation = rsm.get_simplified_equation()
-    prediction_table = rsm.generate_prediction_table()
-    contribution_table = rsm.calculate_contribution_percentage()
-    anova_table = rsm.calculate_detailed_anova()
-
-    # Generate all the figures and store them
-    rsm.generate_all_plots()
-
-    # Format the equation so it renders better in Markdown
-    equation_formatted = equation.replace(" + ", "<br>+ ").replace(" ** ", "^").replace("*", " × ")
-    equation_formatted = f"### Ecuación del Modelo Simplificado:<br>{equation_formatted}"
-
-    # Save the tables to a temporary Excel file
-    excel_path = rsm.save_tables_to_excel()
-
-    # Save all the figures to a temporary ZIP
-    zip_path = rsm.save_figures_to_zip()
-
-    return (
-        model_completo.summary().as_html(),
-        pareto_completo,
-        model_simplificado.summary().as_html(),
-        pareto_simplificado,
-        equation_formatted,
-        optimization_table,
-        prediction_table,
-        contribution_table,
-        anova_table,
-        zip_path,  # Path to the ZIP of plots
-        excel_path  # Path to the Excel file of tables
-    )
-
-def show_plot(current_index, all_figures):
-    if not all_figures:
-        return None, "No hay gráficos disponibles.", current_index
-    selected_fig = all_figures[current_index]
-    plot_info_text = f"Gráfico {current_index + 1} de {len(all_figures)}"
-    return selected_fig, plot_info_text, current_index
-
-def navigate_plot(direction, current_index, all_figures):
-    """
-    Navigate between the plots.
-    """
-    if not all_figures:
-        return None, "No hay gráficos disponibles.", current_index
-
-    if direction == 'left':
-        new_index = (current_index - 1) % len(all_figures)
-    elif direction == 'right':
-        new_index = (current_index + 1) % len(all_figures)
-    else:
-        new_index = current_index
-
-    selected_fig = all_figures[new_index]
-    plot_info_text = f"Gráfico {new_index + 1} de {len(all_figures)}"
-
-    return selected_fig, plot_info_text, new_index
-
-def download_current_plot(all_figures, current_index):
-    """
-    Download the current figure as a PNG.
-    """
-    if not all_figures:
        return None
-    fig = all_figures[current_index]
-    img_bytes = rsm.save_fig_to_bytes(fig)
-    filename = f"Grafico_RSM_{current_index + 1}.png"
-
-    # Create a temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-        temp_file.write(img_bytes)
-        temp_path = temp_file.name
-
-    return temp_path  # Return only the path
-
-def download_all_plots_zip(all_figures):
-    """
-    Download all the figures in a ZIP file.
-    """
-    if not all_figures:
        return None
-    zip_path = rsm.save_figures_to_zip()
-    filename = f"Graficos_RSM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
-    return zip_path  # Return only the path
-
-def download_all_tables_excel():
-    """
-    Download all the tables in an Excel file with multiple sheets.
-    """
-    if 'rsm' not in globals():
        return None
-    excel_path = rsm.save_tables_to_excel()
-    filename = f"Tablas_RSM_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
-    return excel_path  # Return only the path
-
-# --- Create the Gradio interface ---
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Optimización de la producción de AIA usando RSM Box-Behnken")
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Configuración del Diseño")
-            x1_name_input = gr.Textbox(label="Nombre de la Variable X1 (ej. Glucosa)", value="Glucosa")
-            x2_name_input = gr.Textbox(label="Nombre de la Variable X2 (ej. Extracto de Levadura)", value="Extracto_de_Levadura")
-            x3_name_input = gr.Textbox(label="Nombre de la Variable X3 (ej. Triptófano)", value="Triptofano")
-            y_name_input = gr.Textbox(label="Nombre de la Variable Dependiente (ej. AIA (ppm))", value="AIA_ppm")
-            x1_levels_input = gr.Textbox(label="Niveles de X1 (separados por comas)", value="1, 3.5, 5.5")
-            x2_levels_input = gr.Textbox(label="Niveles de X2 (separados por comas)", value="0.03, 0.2, 0.3")
-            x3_levels_input = gr.Textbox(label="Niveles de X3 (separados por comas)", value="0.4, 0.65, 0.9")
-            data_input = gr.Textbox(label="Datos del Experimento (formato CSV)", lines=10, value="""1,-1,-1,0,166.594
-2,1,-1,0,177.557
-3,-1,1,0,127.261
-4,1,1,0,147.573
-5,-1,0,-1,188.883
-6,1,0,-1,224.527
-7,-1,0,1,190.238
-8,1,0,1,226.483
-9,0,-1,-1,195.550
-10,0,1,-1,149.493
-11,0,-1,1,187.683
-12,0,1,1,148.621
-13,0,0,0,278.951
-14,0,0,0,297.238
-15,0,0,0,280.896""")
-            load_button = gr.Button("Cargar Datos")
-
-        with gr.Column():
-            gr.Markdown("## Datos Cargados")
-            data_output = gr.Dataframe(label="Tabla de Datos", interactive=False)
-
-    # Analysis section, visible only after the data has been loaded
-    with gr.Row(visible=False) as analysis_row:
-        with gr.Column():
-            fit_button = gr.Button("Ajustar Modelo y Optimizar")
-            gr.Markdown("**Modelo Completo**")
-            model_completo_output = gr.HTML()
-            pareto_completo_output = gr.Plot()
-            gr.Markdown("**Modelo Simplificado**")
-            model_simplificado_output = gr.HTML()
-            pareto_simplificado_output = gr.Plot()
-            gr.Markdown("**Ecuación del Modelo Simplificado**")
-            equation_output = gr.HTML()
-            optimization_table_output = gr.Dataframe(label="Tabla de Optimización", interactive=False)
-            prediction_table_output = gr.Dataframe(label="Tabla de Predicciones", interactive=False)
-            contribution_table_output = gr.Dataframe(label="Tabla de % de Contribución", interactive=False)
-            anova_table_output = gr.Dataframe(label="Tabla ANOVA Detallada", interactive=False)
-            gr.Markdown("## Descargar Todas las Tablas")
-            download_excel_button = gr.DownloadButton("Descargar Tablas en Excel")
-
-        with gr.Column():
-            gr.Markdown("## Generar Gráficos de Superficie de Respuesta")
-            fixed_variable_input = gr.Dropdown(label="Variable Fija", choices=["Glucosa", "Extracto_de_Levadura", "Triptofano"], value="Glucosa")
-            fixed_level_input = gr.Slider(label="Nivel de Variable Fija", minimum=-1, maximum=1, step=0.01, value=0.0)
-            plot_button = gr.Button("Generar Gráficos")
-            with gr.Row():
-                left_button = gr.Button("<")
-                right_button = gr.Button(">")
-            rsm_plot_output = gr.Plot()
-            plot_info = gr.Textbox(label="Información del Gráfico", value="Gráfico 1 de 9", interactive=False)
-            with gr.Row():
-                download_plot_button = gr.DownloadButton("Descargar Gráfico Actual (PNG)")
-                download_all_plots_button = gr.DownloadButton("Descargar Todos los Gráficos (ZIP)")
-            current_index_state = gr.State(0)  # State for the current index
-            all_figures_state = gr.State([])  # State for all the figures
-
-    # Load the data
-    load_button.click(
-        load_data,
-        inputs=[x1_name_input, x2_name_input, x3_name_input, y_name_input, x1_levels_input, x2_levels_input, x3_levels_input, data_input],
-        outputs=[data_output, x1_name_input, x2_name_input, x3_name_input, y_name_input, x1_levels_input, x2_levels_input, x3_levels_input, analysis_row]
-    )
-
-    # Fit the model and optimize
-    fit_button.click(
-        fit_and_optimize_model,
-        inputs=[],
        outputs=[
-            model_completo_output,
-            pareto_completo_output,
-            model_simplificado_output,
-            pareto_simplificado_output,
-            equation_output,
-            optimization_table_output,
-            prediction_table_output,
-            contribution_table_output,
-            anova_table_output,
-            download_all_plots_button,
-            download_excel_button
-        ]
-    )
-
-    # Generate and display the plots
-    plot_button.click(
-        lambda fixed_var, fixed_lvl: (rsm.plot_rsm_individual(fixed_var, fixed_lvl), "Gráfico 1 de " + str(len(rsm.all_figures)), 0),
-        inputs=[fixed_variable_input, fixed_level_input],
-        outputs=[rsm_plot_output, plot_info, current_index_state]
-    )
-
-    # Plot navigation
-    left_button.click(
-        navigate_plot,
-        inputs=[gr.Button.get_value(left_button), current_index_state, all_figures_state],
-        outputs=[rsm_plot_output, plot_info, current_index_state]
-    )
-    right_button.click(
-        navigate_plot,
-        inputs=[gr.Button.get_value(right_button), current_index_state, all_figures_state],
-        outputs=[rsm_plot_output, plot_info, current_index_state]
-    )
-
-    # Download the current plot
-    download_plot_button.click(
-        download_current_plot,
-        inputs=[all_figures_state, current_index_state],
-        outputs=download_plot_button
-    )
-
-    # Download all the plots as a ZIP
-    download_all_plots_button.click(
-        download_all_plots_zip,
-        inputs=[all_figures_state],
-        outputs=download_all_plots_button
-    )
-
-    # Download all the tables as an Excel file
-    download_excel_button.click(
-        download_all_tables_excel,
-        inputs=[],
-        outputs=download_excel_button
    )

-    # Usage example
-    gr.Markdown("## Ejemplo de uso")
-    gr.Markdown("""
-    1. Introduce los nombres de las variables y sus niveles en las cajas de texto correspondientes.
-    2. Copia y pega los datos del experimento en la caja de texto 'Datos del Experimento'.
-    3. Haz clic en 'Cargar Datos' para cargar los datos en la tabla.
-    4. Haz clic en 'Ajustar Modelo y Optimizar' para ajustar el modelo y encontrar los niveles óptimos de los factores.
-    5. Selecciona una variable fija y su nivel en los controles deslizantes.
-    6. Haz clic en 'Generar Gráficos' para generar los gráficos de superficie de respuesta.
-    7. Navega entre los gráficos usando los botones '<' y '>'.
-    8. Descarga el gráfico actual en PNG o descarga todos los gráficos en un ZIP.
-    9. Descarga todas las tablas en un archivo Excel con el botón correspondiente.
-    """)
-
-demo.launch()
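
Every plot and optimum in the removed code above hinges on the affine map between coded levels in [-1, 1] and natural units that `coded_to_natural` / `natural_to_coded` implement. A minimal standalone sketch of that round trip, using the old interface's default Glucosa levels (1, 3.5, 5.5 g/L) purely as example values:

# Sketch of the coded <-> natural mapping from the removed RSM_BoxBehnken class.
# levels holds [low, center, high]; only low and high enter the affine map.
levels = [1.0, 3.5, 5.5]  # Glucosa (g/L), the old app's default levels

def coded_to_natural(coded, levels):
    return levels[0] + (coded + 1) * (levels[-1] - levels[0]) / 2  # -1 -> low, +1 -> high

def natural_to_coded(natural, levels):
    return -1 + 2 * (natural - levels[0]) / (levels[-1] - levels[0])  # low -> -1, high -> +1

assert coded_to_natural(-1, levels) == 1.0 and coded_to_natural(1, levels) == 5.5
assert natural_to_coded(3.25, levels) == 0.0  # coded 0 is the midpoint of [low, high], not the center level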
+import os
+import re
+import time
+import logging
+import zipfile
+import requests
+import bibtexparser
+from tqdm import tqdm
+from urllib.parse import quote, urlencode
 import gradio as gr
+from bs4 import BeautifulSoup
 import io
+import asyncio
+import aiohttp
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+
+class PaperDownloader:
+    def __init__(self, output_dir='papers'):
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Updated download sources
+        self.download_sources = [
+            'https://sci-hub.ee/',
+            'https://sci-hub.st/',
+            'https://sci-hub.ru/',
+            'https://sci-hub.ren/',
+            'https://sci-hub.mksa.top/',
+            'https://sci-hub.se/',
+            'https://libgen.rs/scimag/'
         ]

+        # Request headers
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
         }

+    def clean_doi(self, doi):
+        """Clean and encode a DOI for use in a URL"""
+        if not isinstance(doi, str):
+            return None
+        return quote(doi.strip()) if doi else None
+
+    async def fetch_with_headers(self, session, url, timeout=10):
+        """Utility method to fetch a URL with headers and a timeout"""
+        try:
+            async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
+                response.raise_for_status()
+                return await response.text(), response.headers
+        except Exception as e:
+            logger.debug(f"Error fetching {url}: {e}")
+            return None, None
+
+    async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the PDF from the DOI's landing page"""
+        if not doi:
+            return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
                return None

+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            for pdf_url in pdf_urls:
+                try:
+                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return await pdf_response.read()
+                except Exception as e:
+                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+
+        return None
+
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download a paper from Sci-Hub using async requests"""
+        if not doi:
+            logger.warning("DOI not provided")
            return None

+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
+
+                # Try downloading from the found URLs
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                        # Verify that it is a PDF
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return await pdf_response.read()
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

+        return None

+    async def download_paper_libgen_async(self, session, doi):
+        """Download from Libgen; handles the query and the redirection"""
+        if not doi:
            return None

+        base_url = 'https://libgen.rs/scimag/'
+        try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)

+            if not text or "No results" in text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
+                return None

+            soup = BeautifulSoup(text, 'html.parser')

+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')

+            if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
+        return None

+    async def download_paper_google_scholar_async(self, session, doi):
+        """Search Google Scholar for an article with the given DOI and try to get the PDF"""
+        if not doi:
+            return None

+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'

+            text, headers = await self.fetch_with_headers(session, url, timeout=10)
+            if not text:
+                return None

+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")

+        return None
+
+    async def download_paper_crossref_async(self, session, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
            return None

+        try:
+            # Search for an open-access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open-access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_response = await session.get(pdf_url, headers=self.headers)
+                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return await pdf_response.read()
+
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+
+        return None
+
+    async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
+        """Download a paper using multiple strategies, with exponential backoff and async requests"""
+        pdf_content = None
+        retries = 0
+        delay = initial_delay
+
+        async with aiohttp.ClientSession() as session:
+            while retries < max_retries and not pdf_content:
+                try:
+                    pdf_content = (
+                        await self.download_paper_direct_doi_async(session, doi) or
+                        await self.download_paper_scihub_async(session, doi) or
+                        await self.download_paper_libgen_async(session, doi) or
+                        await self.download_paper_google_scholar_async(session, doi) or
+                        await self.download_paper_crossref_async(session, doi)
+                    )
+                    if pdf_content:
+                        return pdf_content
+                except Exception as e:
+                    logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+                if not pdf_content:
+                    retries += 1
+                    logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                    await asyncio.sleep(delay)
+                    delay *= 2  # Exponential backoff
+
+        return None
+
+    def download_paper_scihub(self, doi):
+        """Improved method to download a paper from Sci-Hub"""
+        if not doi:
+            logger.warning("DOI not provided")
            return None

+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+
+                # Request with more tolerance
+                response = requests.get(scihub_url,
+                                        headers=self.headers,
+                                        allow_redirects=True,
+                                        timeout=15)
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, response.text))
+
+                # Try downloading from the found URLs
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = requests.get(pdf_url,
+                                                    headers=self.headers,
+                                                    timeout=10)
+
+                        # Verify that it is a PDF
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return pdf_response.content
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

+        return None

+    def download_paper_libgen(self, doi):
+        """Download from Libgen; handles the query and the redirection"""
+        if not doi:
+            return None

+        base_url = 'https://libgen.rs/scimag/'
+        try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
+            response.raise_for_status()

+            if "No results" in response.text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
+                return None

+            soup = BeautifulSoup(response.text, 'html.parser')

+            # Find the link using a specific selector
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')

+            if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_response.content
+
+        except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
        return None

+    def download_paper_google_scholar(self, doi):
+        """Search Google Scholar for an article with the given DOI and try to get the PDF"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_response.content
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
        return None

+    def download_paper_crossref(self, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
+            return None
+
+        try:
+            # Search for an open-access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = requests.get(url, headers=self.headers, timeout=10)
+
+            if response.status_code == 200:
+                data = response.json()
+                work = data.get('message', {})
+
+                # Search for open-access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_response = requests.get(pdf_url, headers=self.headers)
+                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return pdf_response.content
+
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+
        return None

+    def download_with_retry(self, doi, max_retries=3, initial_delay=2):
+        """Download a paper using multiple strategies with exponential backoff"""
+        pdf_content = None
+        retries = 0
+        delay = initial_delay
+
+        while retries < max_retries and not pdf_content:
+            try:
+                pdf_content = (
+                    self.download_paper_scihub(doi) or
+                    self.download_paper_libgen(doi) or
+                    self.download_paper_google_scholar(doi) or
+                    self.download_paper_crossref(doi)
+                )
+
+                if pdf_content:
+                    return pdf_content
+            except Exception as e:
+                logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+            if not pdf_content:
+                retries += 1
+                logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                time.sleep(delay)
+                delay *= 2  # Exponential backoff

+        return None
+
+    def download_single_doi(self, doi):
+        """Download a single paper from a DOI"""
+        if not doi:
+            return None, "Error: DOI not provided", "Error: DOI not provided"
+
+        try:
+            pdf_content = self.download_with_retry(doi)
+
+            if pdf_content:
+                if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath = os.path.join(self.output_dir, filename)
+                with open(filepath, 'wb') as f:
+                    f.write(pdf_content)
+                logger.info(f"Successfully downloaded: {filename}")
+                return filepath, f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
+            else:
+                logger.warning(f"Could not download: {doi}")
+                return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
+
+        except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
+
+    def download_multiple_dois(self, dois_text):
+        """Download multiple papers from a list of DOIs"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+        for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+            filepath, success_message, fail_message = self.download_single_doi(doi)
+            if filepath:
+                # Unique filename for the ZIP
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            else:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
+    def process_bibtex(self, bib_file):
+        """Process a BibTeX file and download the papers with multiple strategies"""
+        # Read the BibTeX content from the uploaded file object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+        # Parse the BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+        # Extract the DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        # Download the PDFs
+        for doi in tqdm(dois, desc="Downloading papers"):
+            try:
+                # Try to download with multiple methods and retries
+                pdf_content = self.download_with_retry(doi)
+
+                # Save the PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided", None
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            except Exception as e:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                logger.error(f"Error processing {doi}: {e}")
+
+        # Create a ZIP of the downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
+    async def process_bibtex_async(self, bib_file):
+        """Process a BibTeX file and download the papers with multiple strategies (async)"""
+        # Read the BibTeX content from the uploaded file object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+        # Parse the BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+        # Extract the DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        # Download the PDFs
+        for doi in tqdm(dois, desc="Downloading papers"):
+            try:
+                # Try to download with multiple methods and retries
+                pdf_content = await self.download_with_retry_async(doi)
+
+                # Save the PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided", None
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            except Exception as e:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                logger.error(f"Error processing {doi}: {e}")
+
+        # Create a ZIP of the downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
+
+def create_gradio_interface():
+    """Create the Gradio interface for the Paper Downloader"""
+    downloader = PaperDownloader()
+
+    async def download_papers(bib_file, doi_input, dois_input):
+        if bib_file:
+            # Check the file type
+            if not bib_file.name.lower().endswith('.bib'):
+                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
+            zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
+            return zip_path, downloaded_dois, failed_dois, None
+        elif doi_input:
+            filepath, message, failed_doi = downloader.download_single_doi(doi_input)
+            return None, message, failed_doi, filepath
+        elif dois_input:
+            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
+            return zip_path, downloaded_dois, failed_dois, None
+        else:
+            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
+
+    # Gradio interface
+    interface = gr.Interface(
+        fn=download_papers,
+        inputs=[
+            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
+            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
+            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
+        ],
        outputs=[
+            gr.File(label="Download Papers (ZIP) or Single PDF"),
+            gr.HTML(label="""
+            <div style='padding-bottom: 5px; font-weight: bold;'>
+                Found DOIs
+            </div>
+            <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                <div id="downloaded-dois"></div>
+            </div>
+            """),
+            gr.HTML(label="""
+            <div style='padding-bottom: 5px; font-weight: bold;'>
+                Missed DOIs
+            </div>
+            <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                <div id="failed-dois"></div>
+            </div>
+            """),
+            gr.File(label="Downloaded Single PDF")
+        ],
+        title="🔬 Academic Paper Batch Downloader",
+        description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources such as Sci-Hub, Libgen, Google Scholar, and Crossref. You can use any of the three inputs at any time.",
+        theme="Hev832/Applio",
+        examples=[
+            ["example.bib", None, None],  # BibTeX file
+            [None, "10.1038/nature12373", None],  # Single DOI
+            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
+        ],
+        css="""
+        .gradio-container {
+            background-color: black;
+        }
+        .gr-interface {
+            max-width: 800px;
+            margin: 0 auto;
+        }
+        .gr-box {
+            background-color: black;
+            border-radius: 10px;
+            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        }
+        .output-text a {
+            color: #007bff;  /* Blue color for hyperlinks */
+        }
+        """,
+        cache_examples=False,
    )
+
+    # Add JavaScript to update the HTML
+    interface.load = """
+    function(downloaded_dois, failed_dois) {
+        let downloaded_html = '';
+        downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
+            downloaded_html += doi + '<br>';
+        });
+        document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
+        let failed_html = '';
+        failed_dois.split('\\n').filter(Boolean).forEach(doi => {
+            failed_html += doi + '<br>';
+        });
+        document.querySelector("#failed-dois").innerHTML = failed_html;
+        return [downloaded_html, failed_html];
+    }
+    """

+    interface.head = """
+    <script>
+    function copyLink(button) {
+        const linkElement = button.previousElementSibling;
+        const link = linkElement.href;
+        navigator.clipboard.writeText(link)
+            .then(() => {
+                button.innerText = '✓ Copied';
+                button.style.color = 'green';
+                setTimeout(() => {
+                    button.innerText = 'Copy';
+                    button.style.color = '';
+                }, 2000);
+            })
+            .catch(err => {
+                console.error('Failed to copy link: ', err);
+            });
+    }
+    </script>
+    """
+    return interface
+
+
+def main():
+    interface = create_gradio_interface()
+    interface.launch(share=True)
+
+
+if __name__ == "__main__":
+    main()
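
For a quick smoke test of the new downloader outside the Gradio UI, the class can be driven directly. A minimal sketch, assuming the file above is saved as `app.py` with its dependencies (`aiohttp`, `requests`, `beautifulsoup4`, `bibtexparser`, `tqdm`, `gradio`) installed; the DOI is taken from the commit's own examples list:

import asyncio
from app import PaperDownloader

downloader = PaperDownloader(output_dir='papers')

# Synchronous path: returns (filepath, success_html, failure_html)
filepath, ok_html, fail_html = downloader.download_single_doi("10.1038/nature12373")
print(filepath or "download failed")

# Async path: tries the direct DOI page, the Sci-Hub mirrors, Libgen, Google Scholar,
# and Crossref in order, retrying up to 3 times with exponential backoff (2 s, 4 s, 8 s)
pdf_bytes = asyncio.run(downloader.download_with_retry_async("10.1038/nature12373"))
print(len(pdf_bytes) if pdf_bytes else "no PDF found")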