ImageDataExtractor2

Sleeping

App Files Files Community

WebashalarForML commited on Sep 26, 2024

Commit

9ae46f4

verified ·

1 Parent(s): ddc0102

Upload 5 files

Browse files

Files changed (5) hide show

backup/backup.py +75 -0
backup/model.py +412 -0
backup/requirements.txt +6 -0
backup/save_load.py +20 -0
backup/train.py +131 -0

backup/backup.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from .model import GLiNER
+# Initialize GLiNER with the base model
+model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
+# Sample text for entity prediction
+text = """
+ lenskart m: (0)9428002330 Lenskart Store,Surat m: (0)9723817060) e:lenskartsurat@gmail.com Store Address UG-4.Ascon City.Opp.Maheshwari Bhavan,Citylight,Surat-395007"""
+# Labels for entity prediction
+# # Most GLiNER models should work best when entity types are in lower case or title case
+# labels = ["Person", "Mail", "Number", "Address", "Organization","Designation"]
+# # Perform entity prediction
+# entities = model.predict_entities(text, labels, threshold=0.5)
+def NER_Model(text):
+    labels = ["Person", "Mail", "Number", "Address", "Organization","Designation","Link"]
+    # Perform entity prediction
+    entities = model.predict_entities(text, labels, threshold=0.5)
+    # Initialize the processed data dictionary
+    processed_data = {
+            "Name": [],
+            "Contact": [],
+            "Designation": [],
+            "Address": [],
+            "Link": [],
+            "Company": [],
+            "Email": [],
+            "extracted_text": "",
+            }
+    for entity in entities:
+        print(entity["text"], "=>", entity["label"])
+        #loading the data into json
+        if entity["label"]==labels[0]:
+            processed_data['Name'].extend([entity["text"]])
+        if entity["label"]==labels[1]:
+            processed_data['Email'].extend([entity["text"]])
+        if entity["label"]==labels[2]:
+            processed_data['Contact'].extend([entity["text"]])
+        if entity["label"]==labels[3]:
+            processed_data['Address'].extend([entity["text"]])
+        if entity["label"]==labels[4]:
+            processed_data['Company'].extend([entity["text"]])
+        if entity["label"]==labels[5]:
+            processed_data['Designation'].extend([entity["text"]])
+        if entity["label"]==labels[6]:
+            processed_data['Link'].extend([entity["text"]])
+    processed_data['Address']=[', '.join(processed_data['Address'])]
+    processed_data['extracted_text']=[text]
+    return processed_data
+# result=NER_Model(text)
+# print(result)

backup/model.py ADDED Viewed

	@@ -0,0 +1,412 @@

+import argparse
+import json
+from pathlib import Path
+import re
+from typing import Dict, Optional, Union
+import torch
+import torch.nn.functional as F
+from .modules.layers import LstmSeq2SeqEncoder
+from .modules.base import InstructBase
+from .modules.evaluator import Evaluator, greedy_search
+from .modules.span_rep import SpanRepLayer
+from .modules.token_rep import TokenRepLayer
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
+class GLiNER(InstructBase, PyTorchModelHubMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # [ENT] token
+        self.entity_token = "<<ENT>>"
+        self.sep_token = "<<SEP>>"
+        # usually a pretrained bidirectional transformer, returns first subtoken representation
+        self.token_rep_layer = TokenRepLayer(model_name=config.model_name, fine_tune=config.fine_tune,
+                                             subtoken_pooling=config.subtoken_pooling, hidden_size=config.hidden_size,
+                                             add_tokens=[self.entity_token, self.sep_token])
+        # hierarchical representation of tokens
+        self.rnn = LstmSeq2SeqEncoder(
+            input_size=config.hidden_size,
+            hidden_size=config.hidden_size // 2,
+            num_layers=1,
+            bidirectional=True,
+        )
+        # span representation
+        self.span_rep_layer = SpanRepLayer(
+            span_mode=config.span_mode,
+            hidden_size=config.hidden_size,
+            max_width=config.max_width,
+            dropout=config.dropout,
+        )
+        # prompt representation (FFN)
+        self.prompt_rep_layer = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size * 4),
+            nn.Dropout(config.dropout),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size * 4, config.hidden_size)
+        )
+    def compute_score_train(self, x):
+        span_idx = x['span_idx'] * x['span_mask'].unsqueeze(-1)
+        new_length = x['seq_length'].clone()
+        new_tokens = []
+        all_len_prompt = []
+        num_classes_all = []
+        # add prompt to the tokens
+        for i in range(len(x['tokens'])):
+            all_types_i = list(x['classes_to_id'][i].keys())
+            # multiple entity types in all_types. Prompt is appended at the start of tokens
+            entity_prompt = []
+            num_classes_all.append(len(all_types_i))
+            # add enity types to prompt
+            for entity_type in all_types_i:
+                entity_prompt.append(self.entity_token)  # [ENT] token
+                entity_prompt.append(entity_type)  # entity type
+            entity_prompt.append(self.sep_token)  # [SEP] token
+            # prompt format:
+            # [ENT] entity_type [ENT] entity_type ... [ENT] entity_type [SEP]
+            # add prompt to the tokens
+            tokens_p = entity_prompt + x['tokens'][i]
+            # input format:
+            # [ENT] entity_type_1 [ENT] entity_type_2 ... [ENT] entity_type_m [SEP] token_1 token_2 ... token_n
+            # update length of the sequence (add prompt length to the original length)
+            new_length[i] = new_length[i] + len(entity_prompt)
+            # update tokens
+            new_tokens.append(tokens_p)
+            # store prompt length
+            all_len_prompt.append(len(entity_prompt))
+        # create a mask using num_classes_all (0, if it exceeds the number of classes, 1 otherwise)
+        max_num_classes = max(num_classes_all)
+        entity_type_mask = torch.arange(max_num_classes).unsqueeze(0).expand(len(num_classes_all), -1).to(
+            x['span_mask'].device)
+        entity_type_mask = entity_type_mask < torch.tensor(num_classes_all).unsqueeze(-1).to(
+            x['span_mask'].device)  # [batch_size, max_num_classes]
+        # compute all token representations
+        bert_output = self.token_rep_layer(new_tokens, new_length)
+        word_rep_w_prompt = bert_output["embeddings"]  # embeddings for all tokens (with prompt)
+        mask_w_prompt = bert_output["mask"]  # mask for all tokens (with prompt)
+        # get word representation (after [SEP]), mask (after [SEP]) and entity type representation (before [SEP])
+        word_rep = []  # word representation (after [SEP])
+        mask = []  # mask (after [SEP])
+        entity_type_rep = []  # entity type representation (before [SEP])
+        for i in range(len(x['tokens'])):
+            prompt_entity_length = all_len_prompt[i]  # length of prompt for this example
+            # get word representation (after [SEP])
+            word_rep.append(word_rep_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
+            # get mask (after [SEP])
+            mask.append(mask_w_prompt[i, prompt_entity_length:prompt_entity_length + x['seq_length'][i]])
+            # get entity type representation (before [SEP])
+            entity_rep = word_rep_w_prompt[i, :prompt_entity_length - 1]  # remove [SEP]
+            entity_rep = entity_rep[0::2]  # it means that we take every second element starting from the second one
+            entity_type_rep.append(entity_rep)
+        # padding for word_rep, mask and entity_type_rep
+        word_rep = pad_sequence(word_rep, batch_first=True)  # [batch_size, seq_len, hidden_size]
+        mask = pad_sequence(mask, batch_first=True)  # [batch_size, seq_len]
+        entity_type_rep = pad_sequence(entity_type_rep, batch_first=True)  # [batch_size, len_types, hidden_size]
+        # compute span representation
+        word_rep = self.rnn(word_rep, mask)
+        span_rep = self.span_rep_layer(word_rep, span_idx)
+        # compute final entity type representation (FFN)
+        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
+        num_classes = entity_type_rep.shape[1]  # number of entity types
+        # similarity score
+        scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
+        return scores, num_classes, entity_type_mask
+    def forward(self, x):
+        # compute span representation
+        scores, num_classes, entity_type_mask = self.compute_score_train(x)
+        batch_size = scores.shape[0]
+        # loss for filtering classifier
+        logits_label = scores.view(-1, num_classes)
+        labels = x["span_label"].view(-1)  # (batch_size * num_spans)
+        mask_label = labels != -1  # (batch_size * num_spans)
+        labels.masked_fill_(~mask_label, 0)  # Set the labels of padding tokens to 0
+        # one-hot encoding
+        labels_one_hot = torch.zeros(labels.size(0), num_classes + 1, dtype=torch.float32).to(scores.device)
+        labels_one_hot.scatter_(1, labels.unsqueeze(1), 1)  # Set the corresponding index to 1
+        labels_one_hot = labels_one_hot[:, 1:]  # Remove the first column
+        # Shape of labels_one_hot: (batch_size * num_spans, num_classes)
+        # compute loss (without reduction)
+        all_losses = F.binary_cross_entropy_with_logits(logits_label, labels_one_hot,
+                                                        reduction='none')
+        # mask loss using entity_type_mask (B, C)
+        masked_loss = all_losses.view(batch_size, -1, num_classes) * entity_type_mask.unsqueeze(1)
+        all_losses = masked_loss.view(-1, num_classes)
+        # expand mask_label to all_losses
+        mask_label = mask_label.unsqueeze(-1).expand_as(all_losses)
+        # put lower loss for in label_one_hot (2 for positive, 1 for negative)
+        weight_c = labels_one_hot + 1
+        # apply mask
+        all_losses = all_losses * mask_label.float() * weight_c
+        return all_losses.sum()
+    def compute_score_eval(self, x, device):
+        # check if classes_to_id is dict
+        assert isinstance(x['classes_to_id'], dict), "classes_to_id must be a dict"
+        span_idx = (x['span_idx'] * x['span_mask'].unsqueeze(-1)).to(device)
+        all_types = list(x['classes_to_id'].keys())
+        # multiple entity types in all_types. Prompt is appended at the start of tokens
+        entity_prompt = []
+        # add enity types to prompt
+        for entity_type in all_types:
+            entity_prompt.append(self.entity_token)
+            entity_prompt.append(entity_type)
+        entity_prompt.append(self.sep_token)
+        prompt_entity_length = len(entity_prompt)
+        # add prompt
+        tokens_p = [entity_prompt + tokens for tokens in x['tokens']]
+        seq_length_p = x['seq_length'] + prompt_entity_length
+        out = self.token_rep_layer(tokens_p, seq_length_p)
+        word_rep_w_prompt = out["embeddings"]
+        mask_w_prompt = out["mask"]
+        # remove prompt
+        word_rep = word_rep_w_prompt[:, prompt_entity_length:, :]
+        mask = mask_w_prompt[:, prompt_entity_length:]
+        # get_entity_type_rep
+        entity_type_rep = word_rep_w_prompt[:, :prompt_entity_length - 1, :]
+        # extract [ENT] tokens (which are at even positions in entity_type_rep)
+        entity_type_rep = entity_type_rep[:, 0::2, :]
+        entity_type_rep = self.prompt_rep_layer(entity_type_rep)  # (batch_size, len_types, hidden_size)
+        word_rep = self.rnn(word_rep, mask)
+        span_rep = self.span_rep_layer(word_rep, span_idx)
+        local_scores = torch.einsum('BLKD,BCD->BLKC', span_rep, entity_type_rep)
+        return local_scores
+    @torch.no_grad()
+    def predict(self, x, flat_ner=False, threshold=0.5):
+        self.eval()
+        local_scores = self.compute_score_eval(x, device=next(self.parameters()).device)
+        spans = []
+        for i, _ in enumerate(x["tokens"]):
+            local_i = local_scores[i]
+            wh_i = [i.tolist() for i in torch.where(torch.sigmoid(local_i) > threshold)]
+            span_i = []
+            for s, k, c in zip(*wh_i):
+                if s + k < len(x["tokens"][i]):
+                    span_i.append((s, s + k, x["id_to_classes"][c + 1], local_i[s, k, c]))
+            span_i = greedy_search(span_i, flat_ner)
+            spans.append(span_i)
+        return spans
+    def predict_entities(self, text, labels, flat_ner=True, threshold=0.5):
+        tokens = []
+        start_token_idx_to_text_idx = []
+        end_token_idx_to_text_idx = []
+        for match in re.finditer(r'\w+(?:[-_]\w+)*|\S', text):
+            tokens.append(match.group())
+            start_token_idx_to_text_idx.append(match.start())
+            end_token_idx_to_text_idx.append(match.end())
+        input_x = {"tokenized_text": tokens, "ner": None}
+        x = self.collate_fn([input_x], labels)
+        output = self.predict(x, flat_ner=flat_ner, threshold=threshold)
+        entities = []
+        for start_token_idx, end_token_idx, ent_type in output[0]:
+            start_text_idx = start_token_idx_to_text_idx[start_token_idx]
+            end_text_idx = end_token_idx_to_text_idx[end_token_idx]
+            entities.append({
+                "start": start_token_idx_to_text_idx[start_token_idx],
+                "end": end_token_idx_to_text_idx[end_token_idx],
+                "text": text[start_text_idx:end_text_idx],
+                "label": ent_type,
+            })
+        return entities
+    def evaluate(self, test_data, flat_ner=False, threshold=0.5, batch_size=12, entity_types=None):
+        self.eval()
+        data_loader = self.create_dataloader(test_data, batch_size=batch_size, entity_types=entity_types, shuffle=False)
+        device = next(self.parameters()).device
+        all_preds = []
+        all_trues = []
+        for x in data_loader:
+            for k, v in x.items():
+                if isinstance(v, torch.Tensor):
+                    x[k] = v.to(device)
+            batch_predictions = self.predict(x, flat_ner, threshold)
+            all_preds.extend(batch_predictions)
+            all_trues.extend(x["entities"])
+        evaluator = Evaluator(all_trues, all_preds)
+        out, f1 = evaluator.evaluate()
+        return out, f1
+    @classmethod
+    def _from_pretrained(
+        cls,
+        *,
+        model_id: str,
+        revision: Optional[str],
+        cache_dir: Optional[Union[str, Path]],
+        force_download: bool,
+        proxies: Optional[Dict],
+        resume_download: bool,
+        local_files_only: bool,
+        token: Union[str, bool, None],
+        map_location: str = "cpu",
+        strict: bool = False,
+        **model_kwargs,
+    ):
+        # 1. Backwards compatibility: Use "gliner_base.pt" and "gliner_multi.pt" with all data
+        filenames = ["gliner_base.pt", "gliner_multi.pt"]
+        for filename in filenames:
+            model_file = Path(model_id) / filename
+            if not model_file.exists():
+                try:
+                    model_file = hf_hub_download(
+                        repo_id=model_id,
+                        filename=filename,
+                        revision=revision,
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        proxies=proxies,
+                        resume_download=resume_download,
+                        token=token,
+                        local_files_only=local_files_only,
+                    )
+                except HfHubHTTPError:
+                    continue
+            dict_load = torch.load(model_file, map_location=torch.device(map_location))
+            config = dict_load["config"]
+            state_dict = dict_load["model_weights"]
+            config.model_name = "microsoft/deberta-v3-base" if filename == "gliner_base.pt" else "microsoft/mdeberta-v3-base"
+            model = cls(config)
+            model.load_state_dict(state_dict, strict=strict, assign=True)
+            # Required to update flair's internals as well:
+            model.to(map_location)
+            return model
+        # 2. Newer format: Use "pytorch_model.bin" and "gliner_config.json"
+        from .train import load_config_as_namespace
+        model_file = Path(model_id) / "pytorch_model.bin"
+        if not model_file.exists():
+            model_file = hf_hub_download(
+                repo_id=model_id,
+                filename="pytorch_model.bin",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        config_file = Path(model_id) / "gliner_config.json"
+        if not config_file.exists():
+            config_file = hf_hub_download(
+                repo_id=model_id,
+                filename="gliner_config.json",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        config = load_config_as_namespace(config_file)
+        model = cls(config)
+        state_dict = torch.load(model_file, map_location=torch.device(map_location))
+        model.load_state_dict(state_dict, strict=strict, assign=True)
+        model.to(map_location)
+        return model
+    def save_pretrained(
+        self,
+        save_directory: Union[str, Path],
+        *,
+        config: Optional[Union[dict, "DataclassInstance"]] = None,
+        repo_id: Optional[str] = None,
+        push_to_hub: bool = False,
+        **push_to_hub_kwargs,
+    ) -> Optional[str]:
+        """
+        Save weights in local directory.
+        Args:
+            save_directory (`str` or `Path`):
+                Path to directory in which the model weights and configuration will be saved.
+            config (`dict` or `DataclassInstance`, *optional*):
+                Model configuration specified as a key/value dictionary or a dataclass instance.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Huggingface Hub after saving it.
+            repo_id (`str`, *optional*):
+                ID of your repository on the Hub. Used only if `push_to_hub=True`. Will default to the folder name if
+                not provided.
+            kwargs:
+                Additional key word arguments passed along to the [`~ModelHubMixin.push_to_hub`] method.
+        """
+        save_directory = Path(save_directory)
+        save_directory.mkdir(parents=True, exist_ok=True)
+        # save model weights/files
+        torch.save(self.state_dict(), save_directory / "pytorch_model.bin")
+        # save config (if provided)
+        if config is None:
+            config = self.config
+        if config is not None:
+            if isinstance(config, argparse.Namespace):
+                config = vars(config)
+            (save_directory / "gliner_config.json").write_text(json.dumps(config, indent=2))
+        # push to the Hub if required
+        if push_to_hub:
+            kwargs = push_to_hub_kwargs.copy()  # soft-copy to avoid mutating input
+            if config is not None:  # kwarg for `push_to_hub`
+                kwargs["config"] = config
+            if repo_id is None:
+                repo_id = save_directory.name  # Defaults to `save_directory` name
+            return self.push_to_hub(repo_id=repo_id, **kwargs)
+        return None
+    def to(self, device):
+        super().to(device)
+        import flair
+        flair.device = device
+        return self

backup/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+transformers
+huggingface_hub
+flair
+seqeval
+tqdm

backup/save_load.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch
+from model import GLiNER
+def save_model(current_model, path):
+    config = current_model.config
+    dict_save = {"model_weights": current_model.state_dict(), "config": config}
+    torch.save(dict_save, path)
+def load_model(path, model_name=None, device=None):
+    dict_load = torch.load(path, map_location=torch.device('cpu'))
+    config = dict_load["config"]
+    if model_name is not None:
+        config.model_name = model_name
+    loaded_model = GLiNER(config)
+    loaded_model.load_state_dict(dict_load["model_weights"])
+    return loaded_model.to(device) if device is not None else loaded_model

backup/train.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import argparse
+import os
+import torch
+import yaml
+from tqdm import tqdm
+from transformers import get_cosine_schedule_with_warmup
+# from model_nested import NerFilteredSemiCRF
+from .model import GLiNER
+from .modules.run_evaluation import get_for_all_path, sample_train_data
+from save_load import save_model, load_model
+import json
+# train function
+def train(model, optimizer, train_data, num_steps=1000, eval_every=100, log_dir="logs", warmup_ratio=0.1,
+          train_batch_size=8, device='cuda'):
+    model.train()
+    # initialize data loaders
+    train_loader = model.create_dataloader(train_data, batch_size=train_batch_size, shuffle=True)
+    pbar = tqdm(range(num_steps))
+    if warmup_ratio < 1:
+        num_warmup_steps = int(num_steps * warmup_ratio)
+    else:
+        num_warmup_steps = int(warmup_ratio)
+    scheduler = get_cosine_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_steps
+    )
+    iter_train_loader = iter(train_loader)
+    for step in pbar:
+        try:
+            x = next(iter_train_loader)
+        except StopIteration:
+            iter_train_loader = iter(train_loader)
+            x = next(iter_train_loader)
+        for k, v in x.items():
+            if isinstance(v, torch.Tensor):
+                x[k] = v.to(device)
+        try:
+            loss = model(x)  # Forward pass
+        except:
+            continue
+        # check if loss is nan
+        if torch.isnan(loss):
+            continue
+        loss.backward()  # Compute gradients
+        optimizer.step()  # Update parameters
+        scheduler.step()  # Update learning rate schedule
+        optimizer.zero_grad()  # Reset gradients
+        description = f"step: {step} | epoch: {step // len(train_loader)} | loss: {loss.item():.2f}"
+        if (step + 1) % eval_every == 0:
+            current_path = os.path.join(log_dir, f'model_{step + 1}')
+            save_model(model, current_path)
+            #val_data_dir =  "/gpfswork/rech/ohy/upa43yu/NER_datasets" # can be obtained from "https://drive.google.com/file/d/1T-5IbocGka35I7X3CE6yKe5N_Xg2lVKT/view"
+            #get_for_all_path(model, step, log_dir, val_data_dir)  # you can remove this comment if you want to evaluate the model
+            model.train()
+        pbar.set_description(description)
+def create_parser():
+    parser = argparse.ArgumentParser(description="Span-based NER")
+    parser.add_argument("--config", type=str, default="config.yaml", help="Path to config file")
+    parser.add_argument('--log_dir', type=str, default='logs', help='Path to the log directory')
+    return parser
+def load_config_as_namespace(config_file):
+    with open(config_file, 'r') as f:
+        config_dict = yaml.safe_load(f)
+    return argparse.Namespace(**config_dict)
+if __name__ == "__main__":
+    # parse args
+    parser = create_parser()
+    args = parser.parse_args()
+    # load config
+    config = load_config_as_namespace(args.config)
+    config.log_dir = args.log_dir
+    try:
+        with open(config.train_data, 'r') as f:
+            data = json.load(f)
+    except:
+        data = sample_train_data(config.train_data, 10000)
+    if config.prev_path != "none":
+        model = load_model(config.prev_path)
+        model.config = config
+    else:
+        model = GLiNER(config)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    lr_encoder = float(config.lr_encoder)
+    lr_others = float(config.lr_others)
+    optimizer = torch.optim.AdamW([
+        # encoder
+        {'params': model.token_rep_layer.parameters(), 'lr': lr_encoder},
+        {'params': model.rnn.parameters(), 'lr': lr_others},
+        # projection layers
+        {'params': model.span_rep_layer.parameters(), 'lr': lr_others},
+        {'params': model.prompt_rep_layer.parameters(), 'lr': lr_others},
+    ])
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    train(model, optimizer, data, num_steps=config.num_steps, eval_every=config.eval_every,
+          log_dir=config.log_dir, warmup_ratio=config.warmup_ratio, train_batch_size=config.train_batch_size,
+          device=device)