mariacyepes96 committed on
Commit 87b6881
1 Parent(s): 59166e4

Training in progress epoch 0

Files changed (5)
  1. README.md +4 -6
  2. test_2 copy.ipynb +0 -0
  3. test_2.ipynb +408 -0
  4. tf_model.h5 +1 -1
  5. tokenizer.json +3 -3
README.md CHANGED
@@ -16,9 +16,9 @@ probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [distilbert/distilbert-base-uncased](https://huggingface.co/distilbert/distilbert-base-uncased) on an unknown dataset.
 It achieves the following results on the evaluation set:
- - Train Loss: 5.6767
- - Validation Loss: 5.6980
- - Epoch: 2
+ - Train Loss: 6.2045
+ - Validation Loss: 6.1385
+ - Epoch: 0
 
 ## Model description
 
@@ -44,9 +44,7 @@ The following hyperparameters were used during training:
 
 | Train Loss | Validation Loss | Epoch |
 |:----------:|:---------------:|:-----:|
- | 5.8854 | 5.7677 | 0 |
- | 5.7170 | 5.6980 | 1 |
- | 5.6767 | 5.6980 | 2 |
+ | 6.2045 | 6.1385 | 0 |
 
 
 ### Framework versions
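This commit records epoch 0 of a fresh training run, replacing the epoch-2 metrics from the previous run. The commit message "Training in progress epoch 0" matches the default message of the Keras `PushToHubCallback` in transformers, which pushes a checkpoint (tf_model.h5, tokenizer files, and the updated README metrics) to the Hub after every epoch. Below is a minimal sketch of that setup using the standard transformers Keras API; the repo id, task head, optimizer, and datasets are placeholders, not taken from this commit.

```python
# Hedged sketch: how per-epoch "Training in progress epoch N" commits are typically produced
# with transformers' Keras callback. The actual training script is not part of this commit.
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
from transformers.keras_callbacks import PushToHubCallback

base_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = TFAutoModelForQuestionAnswering.from_pretrained(base_checkpoint)  # task head is an assumption

model.compile(optimizer="adam")  # the README's hyperparameter section lists the real optimizer settings

push_to_hub_callback = PushToHubCallback(
    output_dir="./model_checkpoints",           # local staging directory (placeholder)
    tokenizer=tokenizer,                        # so tokenizer.json is pushed alongside tf_model.h5
    hub_model_id="mariacyepes96/<model-repo>",  # placeholder repo id
)

# model.fit(train_set, validation_data=validation_set, epochs=3, callbacks=[push_to_hub_callback])
```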
test_2 copy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test_2.ipynb ADDED
@@ -0,0 +1,408 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !python3 -m venv env \n",
+ "# !source env/bin/activate \n",
+ "# !pip3 install langchain\n",
+ "# !pip3 install pypdf2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import PyPDF2\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"bk_example.pdf\", \"rb\") as file:\n",
+ " reader = PyPDF2.PdfReader(file)\n",
+ " text_all = ''\n",
+ " # Extract text from each page\n",
+ " for page_num in range(len(reader.pages)):\n",
+ " page = reader.pages[page_num]\n",
+ " text = page.extract_text()\n",
+ " text_all = text_all +text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import getpass\n",
+ "import os\n",
+ "\n",
+ "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
+ "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import Optional\n",
+ "\n",
+ "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
+ "from langchain_core.pydantic_v1 import BaseModel, Field\n",
+ "\n",
+ "# Define a custom prompt to provide instructions and any additional context.\n",
+ "# 1) You can add examples into the prompt template to improve extraction quality\n",
+ "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
+ "# about the document from which the text was extracted.)\n",
+ "prompt = ChatPromptTemplate.from_messages(\n",
+ " [\n",
+ " (\n",
+ " \"system\",\n",
+ " \"You are an expert extraction algorithm. \"\n",
+ " \"Only extract relevant information from the text. \"\n",
+ " \"If you do not know the value of an attribute asked to extract, \"\n",
+ " \"return null for the attribute's value.\",\n",
+ " ),\n",
+ " # Please see the how-to about improving performance with\n",
+ " # reference examples.\n",
+ " # MessagesPlaceholder('examples'),\n",
+ " (\"human\", \"{text}\"),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import Optional\n",
+ "\n",
+ "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
+ "from langchain_core.pydantic_v1 import BaseModel, Field\n",
+ "\n",
+ "# Define a custom prompt to provide instructions and any additional context.\n",
+ "# 1) You can add examples into the prompt template to improve extraction quality\n",
+ "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
+ "# about the document from which the text was extracted.)\n",
+ "prompt = ChatPromptTemplate.from_messages(\n",
+ " [\n",
+ " (\n",
+ " \"system\",\n",
+ " \"You are an expert extraction algorithm. \"\n",
+ " \"Only extract relevant information from the text. \"\n",
+ " \"If you do not know the value of an attribute asked to extract, \"\n",
+ " \"return null for the attribute's value.\",\n",
+ " ),\n",
+ " # Please see the how-to about improving performance with\n",
+ " # reference examples.\n",
+ " # MessagesPlaceholder('examples'),\n",
+ " (\"human\", \"{text}\"),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'langchain_mistralai'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_mistralai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMistralAI\n\u001b[1;32m 3\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatMistralAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmistral-large-latest\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 5\u001b[0m runnable \u001b[38;5;241m=\u001b[39m prompt \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mPerson)\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_mistralai'"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_mistralai import ChatMistralAI\n",
+ "\n",
+ "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n",
+ "\n",
+ "runnable = prompt | llm.with_structured_output(schema=Person)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List, Optional\n",
+ "\n",
+ "from langchain_core.pydantic_v1 import BaseModel, Field\n",
+ "\n",
+ "\n",
+ "class Bankruptcy(BaseModel):\n",
+ " \"\"\"Information about a bankruptcy declaration.\"\"\"\n",
+ "\n",
+ " # ^ Doc-string for the entity Person.\n",
+ " # This doc-string is sent to the LLM as the description of the schema Person,\n",
+ " # and it can help to improve extraction results.\n",
+ "\n",
+ " # Note that:\n",
+ " # 1. Each field is an `optional` -- this allows the model to decline to extract it!\n",
+ " # 2. Each field has a `description` -- this description is used by the LLM.\n",
+ " # Having a good description can help improve extraction results.\n",
+ " ssns: Optional[list] = Field(default=None, description=\"The ssns of the persons\")\n",
+ " chapter: Optional[str] = Field(\n",
+ " default=None, description=\"The chapter of the bankruptcy declaration\"\n",
+ " )\n",
+ " country: Optional[str] = Field(\n",
+ " default=None, description=\"Country were the bankruptcy declaration is made\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class Data(BaseModel):\n",
+ " \"\"\"Extracted data about bankruptcy declaration..\"\"\"\n",
+ "\n",
+ " # Creates a model so that we can extract multiple entities.\n",
+ " people: List[Bankruptcy]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'prompt' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m runnable \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mData)\n\u001b[1;32m 2\u001b[0m runnable\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: text_all})\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'prompt' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "runnable = prompt | llm.with_structured_output(schema=Data)\n",
+ "runnable.invoke({\"text\": text_all})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#print(text_all)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Find SSNs\n",
+ "ssn_pattern = r'\\b(?:Social Security number|ITIN)\\D*(\\d{3}[−\\s]\\d{2}[−\\s]\\d{4})\\b'\n",
+ "ssns = re.findall(ssn_pattern, text_all)\n",
+ "\n",
+ "def find_ssns(text):\n",
+ " ssns = re.findall(ssn_pattern, text_all)\n",
+ " return ssns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Find chapter\n",
+ "chapter_pattern = r'Notice of Chapter (\\d+) Bankruptcy Case \\d{1,2}/\\d{2}'\n",
+ "\n",
+ "def find_chapter(text):\n",
+ " chapters = re.findall(chapter_pattern, text_all)\n",
+ " return chapters[0]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "country_code = {\"United States\": \"US\", \"Canada\":\"CA\"}\n",
+ "\n",
+ "country_pattern = r'\\b(?:United States|Canada)\\b'\n",
+ "\n",
+ "def find_country_code(text):\n",
+ " country_match = re.search(country_pattern, text, re.IGNORECASE)\n",
+ " return country_code.get(country_match[0],None) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Find State\n",
+ "state_pattern = r'\\nDistrict of (\\w+)'\n",
+ "\n",
+ "# Dictionaries for state codes\n",
+ "us_states = {\n",
+ " \"Alabama\": \"AL\", \"Alaska\": \"AK\", \"Arizona\": \"AZ\", \"Arkansas\": \"AR\", \"California\": \"CA\",\n",
+ " \"Colorado\": \"CO\", \"Connecticut\": \"CT\", \"Delaware\": \"DE\", \"Florida\": \"FL\", \"Georgia\": \"GA\",\n",
+ " \"Hawaii\": \"HI\", \"Idaho\": \"ID\", \"Illinois\": \"IL\", \"Indiana\": \"IN\", \"Iowa\": \"IA\",\n",
+ " \"Kansas\": \"KS\", \"Kentucky\": \"KY\", \"Louisiana\": \"LA\", \"Maine\": \"ME\", \"Maryland\": \"MD\",\n",
+ " \"Massachusetts\": \"MA\", \"Michigan\": \"MI\", \"Minnesota\": \"MN\", \"Mississippi\": \"MS\", \"Missouri\": \"MO\",\n",
+ " \"Montana\": \"MT\", \"Nebraska\": \"NE\", \"Nevada\": \"NV\", \"New Hampshire\": \"NH\", \"New Jersey\": \"NJ\",\n",
+ " \"New Mexico\": \"NM\", \"New York\": \"NY\", \"North Carolina\": \"NC\", \"North Dakota\": \"ND\", \"Ohio\": \"OH\",\n",
+ " \"Oklahoma\": \"OK\", \"Oregon\": \"OR\", \"Pennsylvania\": \"PA\", \"Rhode Island\": \"RI\", \"South Carolina\": \"SC\",\n",
+ " \"South Dakota\": \"SD\", \"Tennessee\": \"TN\", \"Texas\": \"TX\", \"Utah\": \"UT\", \"Vermont\": \"VT\",\n",
+ " \"Virginia\": \"VA\", \"Washington\": \"WA\", \"West Virginia\": \"WV\", \"Wisconsin\": \"WI\", \"Wyoming\": \"WY\"\n",
+ "}\n",
+ "\n",
+ "canadian_provinces = {\n",
+ " \"Alberta\": \"AB\", \"British Columbia\": \"BC\", \"Manitoba\": \"MB\", \"New Brunswick\": \"NB\", \"Newfoundland and Labrador\": \"NL\",\n",
+ " \"Northwest Territories\": \"NT\", \"Nova Scotia\": \"NS\", \"Nunavut\": \"NU\", \"Ontario\": \"ON\", \"Prince Edward Island\": \"PE\",\n",
+ " \"Quebec\": \"QC\", \"Saskatchewan\": \"SK\", \"Yukon\": \"YT\"\n",
+ "}\n",
+ "\n",
+ "def find_state_code(text,country_code):\n",
+ " state_match = re.search(state_pattern, text)\n",
+ " \n",
+ " if state_match:\n",
+ " # Extract the state or province name from the match\n",
+ " state_name = state_match.group(1).strip()\n",
+ " \n",
+ " if country_code == 'US':\n",
+ " state_code = us_states.get(state_name,None)\n",
+ " elif country_code == 'CA':\n",
+ " state_code = canadian_provinces.get(state_name,None)\n",
+ " else:\n",
+ " state_code = None\n",
+ " \n",
+ " return state_code\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Find stage\n",
+ "stage_patterns = {\n",
+ " 'Petition': r'\\b(case filed|petition filed|automatic stay)\\b',\n",
+ " 'Discharge': r'\\b(discharge of debts|discharge order|case discharged)\\b',\n",
+ " 'Dismissed': r'\\b(case dismissed|dismissal|converted to Chapter 7)\\b'\n",
+ "}\n",
+ "\n",
+ "# Function to categorize bankruptcy stages from text\n",
+ "def categorize_stage(text):\n",
+ " categorized_stages = {'Petition': False, 'Discharge': False, 'Dismissed': False}\n",
+ " \n",
+ " for stage, pattern in stage_patterns.items():\n",
+ " if re.search(pattern, text, re.IGNORECASE):\n",
+ " categorized_stages[stage] = True\n",
+ " \n",
+ " # Determine the final stage based on the presence of keywords\n",
+ " if categorized_stages['Petition']:\n",
+ " return 'Petition'\n",
+ " elif categorized_stages['Discharge']:\n",
+ " return 'Discharge'\n",
+ " elif categorized_stages['Dismissed']:\n",
+ " return 'Dismissed'\n",
+ " else:\n",
+ " return 'Unknown'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Data found: {'ssns': ['461−81−0513', '529−97−1200'], 'chapter': '13', 'country_code': 'US', 'state': 'UT', 'stage': 'Petition'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = { \"ssns\": find_ssns(text_all),\n",
+ " \"chapter\": find_chapter(text_all),\n",
+ " \"country_code\": find_country_code(text_all),\n",
+ " \"state\": find_state_code(text_all, find_country_code(text_all)),\n",
+ " \"stage\": categorize_stage(text_all)\n",
+ " }\n",
+ "\n",
+ "print(f\"Data found: {data}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:93da27aaa4b11ffaa209720a01eb150ecdf0fd014c1bd1f5e623c90b293e841c
+ oid sha256:142c651418d7bc9ced202b8cb5b0154f7378acee3eb6d55b7bea7affea11c187
 size 265583592
tokenizer.json CHANGED
@@ -2,13 +2,13 @@
 "version": "1.0",
 "truncation": {
 "direction": "Right",
- "max_length": 384,
+ "max_length": 512,
 "strategy": "OnlySecond",
- "stride": 0
+ "stride": 128
 },
 "padding": {
 "strategy": {
- "Fixed": 384
+ "Fixed": 512
 },
 "direction": "Right",
 "pad_to_multiple_of": null,