Fine-tuned on the DocVQA dataset (40,000 questions).

import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForDocumentQuestionAnswering

model_name = "TusharGoel/LayoutLMv2-finetuned-docvqa"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_name)


def pipeline(question, words, boxes, **kwargs):
    images = kwargs["images"]
    try:
        # Encode the image, the question, and the OCR words with their bounding boxes.
        encoding = processor(
            images,
            question,
            words,
            boxes=boxes,
            return_token_type_ids=True,
            return_tensors="pt",
            truncation=True,
        )
        word_ids = encoding.word_ids(0)

        with torch.no_grad():
            outputs = model(**encoding)

        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        # Map the highest-scoring start/end tokens back to word indices
        # and join the corresponding OCR words into the answer string.
        start, end = word_ids[start_scores.argmax(-1)], word_ids[end_scores.argmax(-1)]
        answer = " ".join(words[start : end + 1])

        # Confidence score: mask out padding tokens, apply a softmax to the
        # start/end logits, and take the best joint start/end probability.
        start_scores, end_scores = start_scores.detach().numpy(), end_scores.detach().numpy()
        undesired_tokens = encoding["attention_mask"].numpy()
        undesired_tokens_mask = undesired_tokens == 0

        start_ = np.where(undesired_tokens_mask, -10000.0, start_scores)
        end_ = np.where(undesired_tokens_mask, -10000.0, end_scores)
        start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
        end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

        # Outer product of start and end probabilities; keep only spans where
        # end >= start and the span is at most max_answer_len tokens long.
        outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))
        max_answer_len = 20
        candidates = np.tril(np.triu(outer), max_answer_len - 1)
        scores_flat = candidates.flatten()

        idx_sort = [np.argmax(scores_flat)]
        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]

        scores = candidates[0, start, end]
        score = scores[0]
    except Exception:
        answer, score = "", 0.0
    return answer, score
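
The helper above expects the caller to supply OCR words and bounding boxes. Below is a minimal usage sketch, not part of the original snippet, showing one way to produce them with pytesseract and normalize the boxes to the 0-1000 coordinate range LayoutLMv2 expects; the image file name and the question are illustrative placeholders, and the sketch assumes the processor is configured so that words and boxes are passed in manually rather than produced by its built-in OCR.

from PIL import Image
import pytesseract

# Illustrative inputs: replace the file name and question with your own.
image = Image.open("document.png").convert("RGB")
width, height = image.size

# Run OCR and keep non-empty words together with their pixel-level boxes.
ocr = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
words, boxes = [], []
for text, left, top, w, h in zip(
    ocr["text"], ocr["left"], ocr["top"], ocr["width"], ocr["height"]
):
    if text.strip():
        words.append(text)
        # LayoutLMv2 expects boxes normalized to a 0-1000 coordinate scale.
        boxes.append([
            int(1000 * left / width),
            int(1000 * top / height),
            int(1000 * (left + w) / width),
            int(1000 * (top + h) / height),
        ])

answer, score = pipeline("What is the invoice number?", words, boxes, images=image)
print(answer, score)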