Spaces:

histlearn
/

BoletimSed

Sleeping

App Files Files Community

histlearn committed on Oct 27, 2024

Commit

9eb6463

verified ·

1 Parent(s): 50785d0

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -55

app.py CHANGED Viewed

@@ -121,62 +121,81 @@ def calcular_frequencia_media(frequencias):
     return sum(freq_validas) / len(freq_validas)
 def extrair_tabelas_pdf(pdf_path):
-    """Extrai tabelas do PDF e retorna um DataFrame processado."""
     try:
-        # Extrair todas as tabelas
-        tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')
-        print(f"Tabelas extraídas: {len(tables)}")
-        if len(tables) == 0:
-            raise ValueError("Nenhuma tabela foi extraída do PDF.")
         info_aluno = {}
-        # Primeira tabela deve conter as informações do aluno
-        primeira_tabela = tables[0].df
-        # Iterar por cada linha da primeira tabela
-        for i in range(len(primeira_tabela)):
-            linha_atual = primeira_tabela.iloc[i].astype(str)
-            linha_seguinte = primeira_tabela.iloc[i + 1].astype(str) if i + 1 < len(primeira_tabela) else None
-            # Procurar cada informação específica
-            for col in range(len(linha_atual)):
-                valor = str(linha_atual[col]).strip()
-                if "Nome do Aluno:" in valor and linha_seguinte is not None:
-                    nome = str(linha_seguinte[col]).strip()
-                    if nome and nome != "Nome do Aluno:":
-                        info_aluno['nome'] = nome
-                        print(f"Nome encontrado: {nome}")
-                elif "RA:" in valor and linha_seguinte is not None:
-                    ra = str(linha_seguinte[col]).strip()
-                    if ra and ra != "RA:":
-                        info_aluno['ra'] = ra
-                        print(f"RA encontrado: {ra}")
-                elif "Escola:" in valor and linha_seguinte is not None:
-                    escola = str(linha_seguinte[col]).strip()
-                    if escola and escola != "Escola:":
-                        info_aluno['escola'] = escola
-                        print(f"Escola encontrada: {escola}")
-                elif "Turma:" in valor and linha_seguinte is not None:
-                    turma = str(linha_seguinte[col]).strip()
-                    if turma and turma != "Turma:":
-                        info_aluno['turma'] = turma
-                        print(f"Turma encontrada: {turma}")
-        # Encontrar a tabela de notas (geralmente a maior tabela)
         df_notas = None
-        maior_tabela = 0
-        for idx, table in enumerate(tables):
             df_temp = table.df
-            if len(df_temp) > maior_tabela and 'Disciplina' in str(df_temp.iloc[0,0]):
-                maior_tabela = len(df_temp)
-                df_notas = df_temp.copy()
                 df_notas = df_notas.rename(columns={
                     0: 'Disciplina',
                     1: 'Nota B1', 2: 'Freq B1', 3: '%Freq B1', 4: 'AC B1',
@@ -185,6 +204,7 @@ def extrair_tabelas_pdf(pdf_path):
                     13: 'Nota B4', 14: 'Freq B4', 15: '%Freq B4', 16: 'AC B4',
                     17: 'CF', 18: 'Nota Final', 19: 'Freq Final', 20: 'AC Final'
                 })
         if df_notas is None:
             raise ValueError("Tabela de notas não encontrada")
@@ -192,15 +212,10 @@ def extrair_tabelas_pdf(pdf_path):
         # Adicionar informações do aluno ao DataFrame
         df_notas.attrs.update(info_aluno)
-        # Debug: mostrar todas as informações encontradas
-        print("\nInformações do aluno encontradas:")
         for campo, valor in info_aluno.items():
             print(f"{campo}: {valor}")
-        # Debug: mostrar primeira tabela para verificação
-        print("\nPrimeira tabela (para debug):")
-        print(primeira_tabela)
         return df_notas
     except Exception as e:

     return sum(freq_validas) / len(freq_validas)
 def extrair_tabelas_pdf(pdf_path):
+    """Extrai tabelas do PDF usando stream para cabeçalho e lattice para notas."""
     try:
+        # Extrair cabeçalho usando stream (melhor para texto)
+        tables_header = camelot.read_pdf(
+            pdf_path,
+            pages='1',
+            flavor='stream',
+            table_areas=['0,700,600,850']  # Ajuste esta área conforme necessário
+        )
         info_aluno = {}
+        print("Extraindo informações do cabeçalho...")
+        # Processar tabelas do cabeçalho
+        for table in tables_header:
+            df = table.df
+            print("\nConteúdo da tabela:")
+            print(df)
+            for i in range(len(df)):
+                row = df.iloc[i]
+                for j in range(len(row)):
+                    texto = str(row[j]).strip()
+                    if 'Nome do Aluno:' in texto:
+                        try:
+                            nome = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
+                            if nome and not nome.startswith('Nome'):
+                                info_aluno['nome'] = nome
+                                print(f"Nome encontrado: {nome}")
+                        except:
+                            pass
+                    elif 'RA:' in texto:
+                        try:
+                            ra = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
+                            if ra and not ra.startswith('RA'):
+                                info_aluno['ra'] = ra
+                                print(f"RA encontrado: {ra}")
+                        except:
+                            pass
+                    elif 'Escola:' in texto:
+                        try:
+                            escola = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
+                            if escola and not escola.startswith('Escola'):
+                                info_aluno['escola'] = escola
+                                print(f"Escola encontrada: {escola}")
+                        except:
+                            pass
+                    elif 'Turma:' in texto:
+                        try:
+                            turma = str(df.iloc[i, j+1]).strip() if j+1 < len(row) else str(df.iloc[i+1, 0]).strip()
+                            if turma and not turma.startswith('Turma'):
+                                info_aluno['turma'] = turma
+                                print(f"Turma encontrada: {turma}")
+                        except:
+                            pass
+        # Extrair tabela de notas usando lattice (melhor para tabelas estruturadas)
+        tables_notas = camelot.read_pdf(
+            pdf_path,
+            pages='all',
+            flavor='lattice'
+        )
+        print(f"\nTabelas de notas extraídas: {len(tables_notas)}")
+        # Encontrar a tabela de notas
         df_notas = None
+        for table in tables_notas:
             df_temp = table.df
+            if 'Disciplina' in str(df_temp.iloc[0,0]):
+                df_notas = df_temp
                 df_notas = df_notas.rename(columns={
                     0: 'Disciplina',
                     1: 'Nota B1', 2: 'Freq B1', 3: '%Freq B1', 4: 'AC B1',
                     13: 'Nota B4', 14: 'Freq B4', 15: '%Freq B4', 16: 'AC B4',
                     17: 'CF', 18: 'Nota Final', 19: 'Freq Final', 20: 'AC Final'
                 })
+                break
         if df_notas is None:
             raise ValueError("Tabela de notas não encontrada")
         # Adicionar informações do aluno ao DataFrame
         df_notas.attrs.update(info_aluno)
+        print("\nInformações finais encontradas:")
         for campo, valor in info_aluno.items():
             print(f"{campo}: {valor}")
         return df_notas
     except Exception as e: