Spaces:

histlearn
/

BoletimSed

Sleeping

App Files Community

histlearn committed on Oct 27, 2024

Commit

b34943b

verified ·

1 Parent(s): 9eb6463

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -34

app.py CHANGED Viewed

@@ -121,81 +121,103 @@ def calcular_frequencia_media(frequencias):
     return sum(freq_validas) / len(freq_validas)
 def extrair_tabelas_pdf(pdf_path):
-    """Extrai tabelas do PDF usando stream para cabeçalho e lattice para notas."""
     try:
-        # Extrair cabeçalho usando stream (melhor para texto)
         tables_header = camelot.read_pdf(
             pdf_path,
             pages='1',
             flavor='stream',
-            table_areas=['0,700,600,850']  # Ajuste esta área conforme necessário
         )
-        info_aluno = {}
-        print("Extraindo informações do cabeçalho...")
-        # Processar tabelas do cabeçalho
         for table in tables_header:
             df = table.df
-            print("\nConteúdo da tabela:")
             print(df)
             for i in range(len(df)):
-                row = df.iloc[i]
-                for j in range(len(row)):
-                    texto = str(row[j]).strip()
-                    if 'Nome do Aluno:' in texto:
                         try:
-                            nome = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
-                            if nome and not nome.startswith('Nome'):
                                 info_aluno['nome'] = nome
                                 print(f"Nome encontrado: {nome}")
                         except:
-                            pass
-                    elif 'RA:' in texto:
                         try:
-                            ra = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
-                            if ra and not ra.startswith('RA'):
                                 info_aluno['ra'] = ra
                                 print(f"RA encontrado: {ra}")
                         except:
-                            pass
-                    elif 'Escola:' in texto:
                         try:
-                            escola = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
-                            if escola and not escola.startswith('Escola'):
                                 info_aluno['escola'] = escola
                                 print(f"Escola encontrada: {escola}")
                         except:
-                            pass
-                    elif 'Turma:' in texto:
                         try:
-                            turma = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
-                            if turma and not turma.startswith('Turma'):
                                 info_aluno['turma'] = turma
                                 print(f"Turma encontrada: {turma}")
                         except:
-                            pass
-        # Extrair tabela de notas usando lattice (melhor para tabelas estruturadas)
         tables_notas = camelot.read_pdf(
             pdf_path,
             pages='all',
             flavor='lattice'
         )
-        print(f"\nTabelas de notas extraídas: {len(tables_notas)}")
-        # Encontrar a tabela de notas
         df_notas = None
         for table in tables_notas:
             df_temp = table.df
-            if 'Disciplina' in str(df_temp.iloc[0,0]):
-                df_notas = df_temp
                 df_notas = df_notas.rename(columns={
                     0: 'Disciplina',
                     1: 'Nota B1', 2: 'Freq B1', 3: '%Freq B1', 4: 'AC B1',
@@ -204,7 +226,6 @@ def extrair_tabelas_pdf(pdf_path):
                     13: 'Nota B4', 14: 'Freq B4', 15: '%Freq B4', 16: 'AC B4',
                     17: 'CF', 18: 'Nota Final', 19: 'Freq Final', 20: 'AC Final'
                 })
-                break
         if df_notas is None:
             raise ValueError("Tabela de notas não encontrada")
@@ -219,8 +240,37 @@ def extrair_tabelas_pdf(pdf_path):
         return df_notas
     except Exception as e:
-        print(f"Erro na extração das tabelas: {str(e)}")
-        raise
 def obter_disciplinas_validas(df):
     """Identifica disciplinas válidas no boletim com seus dados."""

     return sum(freq_validas) / len(freq_validas)
 def extrair_tabelas_pdf(pdf_path):
+    """Extrai tabelas do PDF usando múltiplas abordagens."""
     try:
+        info_aluno = {}
+        print("Iniciando extração de informações...")
+        # Primeira tentativa: Extrair toda a primeira página com stream
         tables_header = camelot.read_pdf(
             pdf_path,
             pages='1',
             flavor='stream',
+            edge_tol=500  # Aumentar tolerância para detectar bordas
         )
+        print(f"Tabelas encontradas na primeira tentativa: {len(tables_header)}")
+        # Processar todas as tabelas encontradas
         for table in tables_header:
             df = table.df
+            print("\nAnalisando tabela:")
             print(df)
+            # Procurar em cada célula da tabela
             for i in range(len(df)):
+                for j in range(len(df.columns)):
+                    texto = str(df.iloc[i,j]).strip()
+                    # Nome do Aluno
+                    if 'Nome do Aluno' in texto:
+                        # Tentar diferentes posições para o valor
                         try:
+                            if j + 1 < len(df.columns):
+                                nome = str(df.iloc[i,j+1]).strip()
+                            elif i + 1 < len(df):
+                                nome = str(df.iloc[i+1,j]).strip()
+                            if nome and nome != 'Nome do Aluno:':
                                 info_aluno['nome'] = nome
                                 print(f"Nome encontrado: {nome}")
                         except:
+                            continue
+                    # RA
+                    elif 'RA' in texto and len(texto) < 5:  # Para evitar falsos positivos
                         try:
+                            if j + 1 < len(df.columns):
+                                ra = str(df.iloc[i,j+1]).strip()
+                            elif i + 1 < len(df):
+                                ra = str(df.iloc[i+1,j]).strip()
+                            if ra and ra != 'RA:':
                                 info_aluno['ra'] = ra
                                 print(f"RA encontrado: {ra}")
                         except:
+                            continue
+                    # Escola
+                    elif 'Escola' in texto:
                         try:
+                            if j + 1 < len(df.columns):
+                                escola = str(df.iloc[i,j+1]).strip()
+                            elif i + 1 < len(df):
+                                escola = str(df.iloc[i+1,j]).strip()
+                            if escola and escola != 'Escola:':
                                 info_aluno['escola'] = escola
                                 print(f"Escola encontrada: {escola}")
                         except:
+                            continue
+                    # Turma
+                    elif 'Turma' in texto:
                         try:
+                            if j + 1 < len(df.columns):
+                                turma = str(df.iloc[i,j+1]).strip()
+                            elif i + 1 < len(df):
+                                turma = str(df.iloc[i+1,j]).strip()
+                            if turma and turma != 'Turma:':
                                 info_aluno['turma'] = turma
                                 print(f"Turma encontrada: {turma}")
                         except:
+                            continue
+        # Segunda parte: Extrair tabela de notas usando lattice
         tables_notas = camelot.read_pdf(
             pdf_path,
             pages='all',
             flavor='lattice'
         )
+        print(f"\nTabelas de notas encontradas: {len(tables_notas)}")
+        # Encontrar tabela de notas (procurar a maior tabela com 'Disciplina')
         df_notas = None
+        max_rows = 0
         for table in tables_notas:
             df_temp = table.df
+            if len(df_temp) > max_rows and 'Disciplina' in str(df_temp.iloc[0,0]):
+                max_rows = len(df_temp)
+                df_notas = df_temp.copy()
                 df_notas = df_notas.rename(columns={
                     0: 'Disciplina',
                     1: 'Nota B1', 2: 'Freq B1', 3: '%Freq B1', 4: 'AC B1',
                     13: 'Nota B4', 14: 'Freq B4', 15: '%Freq B4', 16: 'AC B4',
                     17: 'CF', 18: 'Nota Final', 19: 'Freq Final', 20: 'AC Final'
                 })
         if df_notas is None:
             raise ValueError("Tabela de notas não encontrada")
         return df_notas
     except Exception as e:
+        print(f"Erro detalhado na extração: {str(e)}")
+        print("Tentando abordagem alternativa...")
+        try:
+            # Tentativa alternativa usando apenas lattice
+            tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')
+            if len(tables) > 0:
+                df = tables[0].df
+                df_notas = None
+                for table in tables:
+                    if 'Disciplina' in str(table.df.iloc[0,0]):
+                        df_notas = table.df
+                        df_notas = df_notas.rename(columns={
+                            0: 'Disciplina',
+                            1: 'Nota B1', 2: 'Freq B1', 3: '%Freq B1', 4: 'AC B1',
+                            5: 'Nota B2', 6: 'Freq B2', 7: '%Freq B2', 8: 'AC B2',
+                            9: 'Nota B3', 10: 'Freq B3', 11: '%Freq B3', 12: 'AC B3',
+                            13: 'Nota B4', 14: 'Freq B4', 15: '%Freq B4', 16: 'AC B4',
+                            17: 'CF', 18: 'Nota Final', 19: 'Freq Final', 20: 'AC Final'
+                        })
+                        break
+                if df_notas is not None:
+                    return df_notas
+            raise ValueError("Não foi possível extrair as tabelas em nenhuma tentativa")
+        except Exception as e2:
+            print(f"Erro na tentativa alternativa: {str(e2)}")
+            raise
 def obter_disciplinas_validas(df):
     """Identifica disciplinas válidas no boletim com seus dados."""