Spaces:

eduardmtz
/

www

Running

File size: 3,997 Bytes

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Train Model from PDFs</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <!--
        pdf.js is pinned to a 3.x release: the inline script below relies on the
        classic (non-module) build that exposes the `pdfjsLib` global. An unpinned
        URL follows the latest release, where this file path may no longer exist.
    -->
    <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
    <script>
        // Point pdf.js at the worker that matches the pinned library version.
        pdfjsLib.GlobalWorkerOptions.workerSrc =
            'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
    </script>
</head>
<body>
    <!-- Navigation links are visible content, so they belong in <body>, not <head>. -->
    <a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
    <a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
    <br><br>
    <h1>Train Model from PDFs</h1>
    <input type="file" id="fileInput" multiple>
    <button id="trainModel">Train Model</button>
    <pre id="status"></pre>

    <script>
        /**
         * Extracts the text of every page of a PDF and returns it as one string.
         *
         * Pages are read in order, one at a time. Within a page the `str` of each
         * text item is joined with a single space, and every page's text is
         * followed by one trailing space.
         *
         * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file - Object
         *     exposing `arrayBuffer()`, e.g. a File taken from a file input.
         * @returns {Promise<string>} Concatenated text of all pages ('' when the
         *     document reports zero pages).
         */
        async function extractTextFromPDF(file) {
            const bytes = await file.arrayBuffer();
            const loadingTask = pdfjsLib.getDocument(bytes);
            const doc = await loadingTask.promise;

            const pageTexts = [];
            let pageNumber = 1; // pdf.js page numbers start at 1
            while (pageNumber <= doc.numPages) {
                const currentPage = await doc.getPage(pageNumber);
                const { items } = await currentPage.getTextContent();
                const joined = items.map(({ str }) => str).join(' ');
                pageTexts.push(`${joined} `);
                pageNumber += 1;
            }

            return pageTexts.join('');
        }

        /**
         * Builds, trains and saves a small dense binary classifier.
         *
         * The network is dense(128, relu) -> dense(64, relu) -> dense(1, sigmoid),
         * compiled with the Adam optimizer and binary cross-entropy loss. It is
         * trained for 10 epochs and then saved to
         * `indexeddb://pdf-trained-model`. Progress and errors are written to the
         * `#status` element.
         *
         * @param {Array<{input: number[], label: number}>} data - Training
         *     samples. Every `input` must have the same length as the first
         *     sample's, because that length defines the model's input shape.
         * @returns {Promise<void>} Resolves after the save attempt has finished
         *     (a save failure is reported in the status element, not thrown).
         */
        async function trainModel(data) {
            const statusEl = document.getElementById('status');

            // Nothing to train on: report it instead of crashing on data[0].
            if (!Array.isArray(data) || data.length === 0) {
                statusEl.textContent = 'No training data available.';
                return;
            }

            // Samples are {input, label} objects, so the feature count lives on
            // `.input`; `data[0].length` would be undefined.
            const featureCount = data[0].input.length;

            const model = tf.sequential();

            model.add(tf.layers.dense({
                units: 128,
                activation: 'relu',
                inputShape: [featureCount]
            }));

            model.add(tf.layers.dense({ units: 64, activation: 'relu' }));
            model.add(tf.layers.dense({ units: 1, activation: 'sigmoid' }));

            model.compile({
                optimizer: 'adam',
                loss: 'binaryCrossentropy',
                metrics: ['accuracy']
            });

            let inputs = null;
            let labels = null;

            try {
                inputs = tf.tensor2d(data.map(d => d.input));
                labels = tf.tensor1d(data.map(d => d.label));

                statusEl.textContent = 'Training the model...';

                await model.fit(inputs, labels, {
                    epochs: 10,
                    callbacks: {
                        onEpochEnd: (epoch, logs) => {
                            // `epoch` is 0-based; show it 1-based in both places.
                            console.log(`Epoch ${epoch + 1}: loss = ${logs.loss}`);
                            statusEl.textContent = `Epoch ${epoch + 1}: Loss = ${logs.loss}`;
                        }
                    }
                });
            } finally {
                // Tensors are not garbage-collected; free them explicitly so
                // repeated training runs do not leak memory.
                if (inputs) {
                    inputs.dispose();
                }
                if (labels) {
                    labels.dispose();
                }
            }

            statusEl.textContent = 'Saving the model to IndexedDB...';

            try {
                await model.save('indexeddb://pdf-trained-model');
                statusEl.textContent = 'Model saved successfully in IndexedDB!';
            } catch (err) {
                statusEl.textContent = 'Error saving the model: ' + err.message;
                console.error('Error saving the model:', err);
            }
        }

        // Number of leading word-length features taken from each PDF.
        const FEATURE_COUNT = 10;

        /**
         * Turns text into a fixed-length numeric feature vector.
         *
         * Features are the character lengths of the first `size` whitespace-
         * separated words. Empty tokens (from leading/trailing whitespace) are
         * dropped, and short texts are right-padded with zeros so every sample
         * has exactly `size` entries, which a rectangular tensor requires.
         *
         * @param {string} text - Text to featurize.
         * @param {number} size - Required vector length.
         * @returns {number[]} Array of exactly `size` numbers.
         */
        function buildFeatureVector(text, size) {
            const features = text
                .split(/\s+/)
                .filter(Boolean)
                .slice(0, size)
                .map(word => word.length); // Example: using word lengths as features

            while (features.length < size) {
                features.push(0);
            }
            return features;
        }

        // On click: extract text from each selected PDF, build one training
        // sample per file, then train and save the model.
        document.getElementById('trainModel').addEventListener('click', async () => {
            const statusEl = document.getElementById('status');
            const files = document.getElementById('fileInput').files;
            if (!files.length) {
                statusEl.textContent = 'Please select PDF files to train the model.';
                return;
            }

            try {
                const data = [];
                statusEl.textContent = 'Extracting text from PDFs...';

                for (const file of files) {
                    const text = await extractTextFromPDF(file);

                    data.push({
                        input: buildFeatureVector(text, FEATURE_COUNT),
                        // Example label (adjust as needed for your use case).
                        // NOTE: with every label equal to 1 the classifier has
                        // nothing to discriminate between.
                        label: 1
                    });
                }

                statusEl.textContent = 'Training the model...';
                await trainModel(data);
            } catch (err) {
                // Surface extraction/training failures instead of leaving the
                // status stuck and the rejection unhandled.
                statusEl.textContent = `Error: ${err.message}`;
                console.error('Error while training from PDFs:', err);
            }
        });
    </script>
</body>
</html>