Spaces:

eduardmtz
/

www

Running

App Files Files Community

www / entrenament-pdf.html

eduardmtz

Update entrenament-pdf.html

eb7343a verified 11 months ago

raw

history blame contribute delete

4 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Train Model from PDFs</title>
	<a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
	<a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
	<br><br>
	<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
	<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
	</head>
	<body>
	<h1>Train Model from PDFs</h1>
	<input type="file" id="fileInput" multiple>
	<button id="trainModel">Train Model</button>
	<pre id="status"></pre>

	<script>
	/**
	 * Reads every page of a PDF and returns its text as one string.
	 *
	 * The text items of each page are joined with single spaces, and each
	 * page's text is followed by one trailing space. Pages are read one at a
	 * time, in order, starting from page 1.
	 *
	 * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file - Object
	 *   exposing `arrayBuffer()` (e.g. an entry of a file input's `files`).
	 * @returns {Promise<string>} Concatenated text of all pages.
	 */
	async function extractTextFromPDF(file) {
	  const loadingTask = pdfjsLib.getDocument(await file.arrayBuffer());
	  const doc = await loadingTask.promise;

	  const pageTexts = [];
	  let pageNumber = 1;
	  while (pageNumber <= doc.numPages) {
	    const currentPage = await doc.getPage(pageNumber);
	    const { items } = await currentPage.getTextContent();
	    const words = items.map((entry) => entry.str);
	    pageTexts.push(`${words.join(' ')} `);
	    pageNumber += 1;
	  }

	  return pageTexts.join('');
	}

	/**
	 * Builds, trains and saves a small binary classifier with TensorFlow.js.
	 *
	 * The network is dense(128, relu) -> dense(64, relu) -> dense(1, sigmoid),
	 * compiled with the Adam optimizer and binary cross-entropy loss. Progress
	 * and errors are written to the `#status` element. After training, the
	 * model is saved under `indexeddb://pdf-trained-model`; a failure while
	 * saving is reported in the status element and the console, not rethrown.
	 *
	 * @param {Array<{input: number[], label: number}>} data - Training samples.
	 *   Every `input` must have the same, non-zero length; that length is the
	 *   width of the model's input layer.
	 * @returns {Promise<void>} Resolves once training and the save attempt end.
	 * @throws {Error} If `data` is empty or the samples' `input` lengths differ
	 *   (the message is also shown in the status element).
	 */
	async function trainModel(data) {
	  const statusEl = document.getElementById('status');

	  // Show a data problem in the UI, then abort by throwing.
	  const fail = (message) => {
	    statusEl.textContent = message;
	    throw new Error(message);
	  };

	  if (!Array.isArray(data) || data.length === 0) {
	    fail('No training data available.');
	  }

	  // The feature width comes from the first sample's `input` array
	  // (the samples are objects, so `data[0].length` would be undefined).
	  const sampleCount = data.length;
	  const firstInput = data[0] && data[0].input;
	  const featureCount = firstInput ? firstInput.length : 0;
	  if (!(featureCount > 0)) {
	    fail('Training samples must have a non-empty "input" feature array.');
	  }
	  const isMalformed = (sample) =>
	    !sample || !sample.input || sample.input.length !== featureCount;
	  if (data.some(isMalformed)) {
	    fail(`Every training sample must have exactly ${featureCount} input features.`);
	  }

	  const model = tf.sequential();
	  let inputs;
	  let labels;

	  try {
	    model.add(tf.layers.dense({
	      units: 128,
	      activation: 'relu',
	      inputShape: [featureCount]
	    }));
	    model.add(tf.layers.dense({ units: 64, activation: 'relu' }));
	    model.add(tf.layers.dense({ units: 1, activation: 'sigmoid' }));

	    model.compile({
	      optimizer: 'adam',
	      loss: 'binaryCrossentropy',
	      metrics: ['accuracy']
	    });

	    inputs = tf.tensor2d(data.map((sample) => sample.input), [sampleCount, featureCount]);
	    // The output layer has one unit, so targets are shaped [samples, 1]
	    // rather than a flat 1-D vector.
	    labels = tf.tensor2d(data.map((sample) => sample.label), [sampleCount, 1]);

	    statusEl.textContent = 'Training the model...';

	    await model.fit(inputs, labels, {
	      epochs: 10,
	      callbacks: {
	        onEpochEnd: (epoch, logs) => {
	          console.log(`Epoch ${epoch}: loss = ${logs.loss}`);
	          statusEl.textContent = `Epoch ${epoch + 1}: Loss = ${logs.loss}`;
	        }
	      }
	    });

	    statusEl.textContent = 'Saving the model to IndexedDB...';

	    try {
	      await model.save('indexeddb://pdf-trained-model');
	      statusEl.textContent = 'Model saved successfully in IndexedDB!';
	    } catch (err) {
	      statusEl.textContent = 'Error saving the model: ' + err.message;
	      console.error('Error saving the model:', err);
	    }
	  } finally {
	    // Tensors and model weights are not garbage-collected by TF.js;
	    // release them explicitly whether training succeeded or not.
	    if (inputs) {
	      inputs.dispose();
	    }
	    if (labels) {
	      labels.dispose();
	    }
	    model.dispose();
	  }
	}

	// Click handler for the "Train Model" button: extracts text from every
	// selected file, turns each document into a fixed-width feature vector and
	// hands the resulting samples to trainModel(). Progress and failures are
	// written to the #status element.
	document.getElementById('trainModel').addEventListener('click', async () => {
	  // Width of every feature vector. All samples must share the same width
	  // to form a rectangular training matrix.
	  const FEATURE_COUNT = 10;

	  const statusEl = document.getElementById('status');
	  const files = document.getElementById('fileInput').files;
	  if (!files.length) {
	    statusEl.textContent = 'Please select PDF files to train the model.';
	    return;
	  }

	  try {
	    const data = [];
	    statusEl.textContent = 'Extracting text from PDFs...';

	    for (const file of files) {
	      const text = await extractTextFromPDF(file);
	      const tokens = text.split(/\s+/).map(word => word.length); // Example: using word lengths as features

	      // Use the first FEATURE_COUNT tokens as input features; documents
	      // with fewer tokens are zero-padded so every vector has equal length.
	      const input = tokens.slice(0, FEATURE_COUNT);
	      while (input.length < FEATURE_COUNT) {
	        input.push(0);
	      }

	      data.push({
	        input,
	        label: 1 // Example label (adjust as needed for your use case)
	      });
	    }

	    statusEl.textContent = 'Training the model...';
	    await trainModel(data);
	  } catch (err) {
	    // Without this, a failed extraction or training run would leave the
	    // status stuck on a progress message and the rejection unhandled.
	    statusEl.textContent = `Error processing the PDFs: ${err.message}`;
	    console.error('Error processing the PDFs:', err);
	  }
	});
	</script>
	</body>
	</html>