File size: 118,371 Bytes
87b6881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !python3 -m venv env \n",
    "# !source env/bin/activate \n",
    "# !pip3 install langchain\n",
    "# !pip3 install pypdf2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting transformers\n",
      "  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m198.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting filelock (from transformers)\n",
      "  Downloading filelock-3.16.0-py3-none-any.whl.metadata (3.0 kB)\n",
      "Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)\n",
      "  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)\n",
      "Requirement already satisfied: numpy>=1.17 in ./env/lib/python3.11/site-packages (from transformers) (1.26.4)\n",
      "Requirement already satisfied: packaging>=20.0 in ./env/lib/python3.11/site-packages (from transformers) (24.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in ./env/lib/python3.11/site-packages (from transformers) (6.0.2)\n",
      "Collecting regex!=2019.12.17 (from transformers)\n",
      "  Downloading regex-2024.7.24-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m992.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in ./env/lib/python3.11/site-packages (from transformers) (2.32.3)\n",
      "Collecting safetensors>=0.4.1 (from transformers)\n",
      "  Downloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)\n",
      "Collecting tokenizers<0.20,>=0.19 (from transformers)\n",
      "  Downloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)\n",
      "Collecting tqdm>=4.27 (from transformers)\n",
      "  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m966.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)\n",
      "  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests->transformers) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests->transformers) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests->transformers) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests->transformers) (2024.7.4)\n",
      "Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.5/9.5 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
      "\u001b[?25hDownloading huggingface_hub-0.24.6-py3-none-any.whl (417 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m417.5/417.5 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading regex-2024.7.24-cp311-cp311-macosx_11_0_arm64.whl (278 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.9/278.9 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl (381 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m381.5/381.5 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl (2.4 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading tqdm-4.66.5-py3-none-any.whl (78 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.4/78.4 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading filelock-3.16.0-py3-none-any.whl (16 kB)\n",
      "Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: tqdm, safetensors, regex, fsspec, filelock, huggingface-hub, tokenizers, transformers\n",
      "Successfully installed filelock-3.16.0 fsspec-2024.9.0 huggingface-hub-0.24.6 regex-2024.7.24 safetensors-0.4.5 tokenizers-0.19.1 tqdm-4.66.5 transformers-4.44.2\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting tensorflow\n",
      "  Downloading tensorflow-2.17.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)\n",
      "Collecting absl-py>=1.0.0 (from tensorflow)\n",
      "  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)\n",
      "Collecting astunparse>=1.6.0 (from tensorflow)\n",
      "  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)\n",
      "Collecting flatbuffers>=24.3.25 (from tensorflow)\n",
      "  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)\n",
      "Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)\n",
      "  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)\n",
      "Collecting google-pasta>=0.1.1 (from tensorflow)\n",
      "  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)\n",
      "Collecting h5py>=3.10.0 (from tensorflow)\n",
      "  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)\n",
      "Collecting libclang>=13.0.0 (from tensorflow)\n",
      "  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)\n",
      "Collecting ml-dtypes<0.5.0,>=0.3.1 (from tensorflow)\n",
      "  Downloading ml_dtypes-0.4.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (20 kB)\n",
      "Collecting opt-einsum>=2.3.2 (from tensorflow)\n",
      "  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)\n",
      "Requirement already satisfied: packaging in ./env/lib/python3.11/site-packages (from tensorflow) (24.1)\n",
      "Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow)\n",
      "  Downloading protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)\n",
      "Requirement already satisfied: requests<3,>=2.21.0 in ./env/lib/python3.11/site-packages (from tensorflow) (2.32.3)\n",
      "Requirement already satisfied: setuptools in ./env/lib/python3.11/site-packages (from tensorflow) (68.2.2)\n",
      "Requirement already satisfied: six>=1.12.0 in ./env/lib/python3.11/site-packages (from tensorflow) (1.16.0)\n",
      "Collecting termcolor>=1.1.0 (from tensorflow)\n",
      "  Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
      "Requirement already satisfied: typing-extensions>=3.6.6 in ./env/lib/python3.11/site-packages (from tensorflow) (4.12.2)\n",
      "Collecting wrapt>=1.11.0 (from tensorflow)\n",
      "  Downloading wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)\n",
      "Collecting grpcio<2.0,>=1.24.3 (from tensorflow)\n",
      "  Downloading grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.9 kB)\n",
      "Collecting tensorboard<2.18,>=2.17 (from tensorflow)\n",
      "  Downloading tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)\n",
      "Collecting keras>=3.2.0 (from tensorflow)\n",
      "  Downloading keras-3.5.0-py3-none-any.whl.metadata (5.8 kB)\n",
      "Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)\n",
      "  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (14 kB)\n",
      "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in ./env/lib/python3.11/site-packages (from tensorflow) (1.26.4)\n",
      "Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)\n",
      "  Downloading wheel-0.44.0-py3-none-any.whl.metadata (2.3 kB)\n",
      "Collecting rich (from keras>=3.2.0->tensorflow)\n",
      "  Downloading rich-13.8.1-py3-none-any.whl.metadata (18 kB)\n",
      "Collecting namex (from keras>=3.2.0->tensorflow)\n",
      "  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)\n",
      "Collecting optree (from keras>=3.2.0->tensorflow)\n",
      "  Downloading optree-0.12.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (47 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m263.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2024.7.4)\n",
      "Collecting markdown>=2.6.8 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)\n",
      "Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)\n",
      "Collecting werkzeug>=1.0.1 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading werkzeug-3.0.4-py3-none-any.whl.metadata (3.7 kB)\n",
      "Collecting MarkupSafe>=2.1.1 (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.0 kB)\n",
      "Collecting markdown-it-py>=2.2.0 (from rich->keras>=3.2.0->tensorflow)\n",
      "  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow) (2.18.0)\n",
      "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow)\n",
      "  Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
      "Downloading tensorflow-2.17.0-cp311-cp311-macosx_12_0_arm64.whl (236.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.2/236.2 MB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading absl_py-2.1.0-py3-none-any.whl (133 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.7/133.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)\n",
      "Downloading flatbuffers-24.3.25-py2.py3-none-any.whl (26 kB)\n",
      "Downloading gast-0.6.0-py3-none-any.whl (21 kB)\n",
      "Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl (10.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.6/10.6 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl (2.9 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading keras-3.5.0-py3-none-any.whl (1.1 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl (25.8 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m25.8/25.8 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading ml_dtypes-0.4.0-cp311-cp311-macosx_10_9_universal2.whl (390 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m390.9/390.9 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.5/65.5 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl (394 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m394.2/394.2 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hDownloading tensorboard-2.17.1-py3-none-any.whl (5.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hDownloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-macosx_12_0_arm64.whl (3.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hDownloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
      "Downloading wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl (38 kB)\n",
      "Downloading Markdown-3.7-py3-none-any.whl (106 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.3/106.3 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)\n",
      "Downloading werkzeug-3.0.4-py3-none-any.whl (227 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.6/227.6 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading wheel-0.44.0-py3-none-any.whl (67 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading namex-0.0.8-py3-none-any.whl (5.8 kB)\n",
      "Downloading optree-0.12.1-cp311-cp311-macosx_11_0_arm64.whl (283 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.7/283.7 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading rich-13.8.1-py3-none-any.whl (241 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.6/241.6 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl (18 kB)\n",
      "Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
      "Installing collected packages: namex, libclang, flatbuffers, wrapt, wheel, termcolor, tensorflow-io-gcs-filesystem, tensorboard-data-server, protobuf, optree, opt-einsum, ml-dtypes, mdurl, MarkupSafe, markdown, h5py, grpcio, google-pasta, gast, absl-py, werkzeug, markdown-it-py, astunparse, tensorboard, rich, keras, tensorflow\n",
      "Successfully installed MarkupSafe-2.1.5 absl-py-2.1.0 astunparse-1.6.3 flatbuffers-24.3.25 gast-0.6.0 google-pasta-0.2.0 grpcio-1.66.1 h5py-3.11.0 keras-3.5.0 libclang-18.1.1 markdown-3.7 markdown-it-py-3.0.0 mdurl-0.1.2 ml-dtypes-0.4.0 namex-0.0.8 opt-einsum-3.3.0 optree-0.12.1 protobuf-4.25.4 rich-13.8.1 tensorboard-2.17.1 tensorboard-data-server-0.7.2 tensorflow-2.17.0 tensorflow-io-gcs-filesystem-0.37.1 termcolor-2.4.0 werkzeug-3.0.4 wheel-0.44.0 wrapt-1.16.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found existing installation: keras 3.5.0\n",
      "Uninstalling keras-3.5.0:\n",
      "  Would remove:\n",
      "    /Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/keras-3.5.0.dist-info/*\n",
      "    /Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/keras/*\n",
      "Proceed (Y/n)? ^C\n",
      "\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
      "\u001b[0mCollecting tf-keras\n",
      "  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)\n",
      "Requirement already satisfied: tensorflow<2.18,>=2.17 in ./env/lib/python3.11/site-packages (from tf-keras) (2.17.0)\n",
      "Requirement already satisfied: absl-py>=1.0.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.1.0)\n",
      "Requirement already satisfied: astunparse>=1.6.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.6.3)\n",
      "Requirement already satisfied: flatbuffers>=24.3.25 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (24.3.25)\n",
      "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.6.0)\n",
      "Requirement already satisfied: google-pasta>=0.1.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.2.0)\n",
      "Requirement already satisfied: h5py>=3.10.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.11.0)\n",
      "Requirement already satisfied: libclang>=13.0.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (18.1.1)\n",
      "Requirement already satisfied: ml-dtypes<0.5.0,>=0.3.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.4.0)\n",
      "Requirement already satisfied: opt-einsum>=2.3.2 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.3.0)\n",
      "Requirement already satisfied: packaging in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (24.1)\n",
      "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (4.25.4)\n",
      "Requirement already satisfied: requests<3,>=2.21.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.32.3)\n",
      "Requirement already satisfied: setuptools in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (68.2.2)\n",
      "Requirement already satisfied: six>=1.12.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.16.0)\n",
      "Requirement already satisfied: termcolor>=1.1.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.4.0)\n",
      "Requirement already satisfied: typing-extensions>=3.6.6 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (4.12.2)\n",
      "Requirement already satisfied: wrapt>=1.11.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.16.0)\n",
      "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.66.1)\n",
      "Requirement already satisfied: tensorboard<2.18,>=2.17 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.17.1)\n",
      "Requirement already satisfied: keras>=3.2.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.5.0)\n",
      "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.37.1)\n",
      "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.26.4)\n",
      "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./env/lib/python3.11/site-packages (from astunparse>=1.6.0->tensorflow<2.18,>=2.17->tf-keras) (0.44.0)\n",
      "Requirement already satisfied: rich in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (13.8.1)\n",
      "Requirement already satisfied: namex in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.0.8)\n",
      "Requirement already satisfied: optree in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.12.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (2024.7.4)\n",
      "Requirement already satisfied: markdown>=2.6.8 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (3.7)\n",
      "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (0.7.2)\n",
      "Requirement already satisfied: werkzeug>=1.0.1 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (3.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=2.1.1 in ./env/lib/python3.11/site-packages (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (2.1.5)\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (3.0.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (2.18.0)\n",
      "Requirement already satisfied: mdurl~=0.1 in ./env/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.1.2)\n",
      "Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m0m\n",
      "\u001b[?25hInstalling collected packages: tf-keras\n",
      "Successfully installed tf-keras-2.17.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip uninstall keras\n",
    "!pip install tf-keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.\n",
      "\n",
      "All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.\n",
      "/Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Load the pre-trained model\n",
    "nlp = pipeline(\"question-answering\", model=\"distilbert-base-uncased-distilled-squad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "context = \"\"\"\n",
    "Hugging Face is an AI company specializing in Natural Language Processing (NLP). Their transformers library is widely used for various NLP tasks, including text classification, question answering, and summarization.\n",
    "\"\"\"\n",
    "question = \"What does Hugging Face specialize in?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.551713228225708, 'start': 47, 'end': 74, 'answer': 'Natural Language Processing'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=context)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pdfplumber\n",
      "  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m193.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting pdfminer.six==20231228 (from pdfplumber)\n",
      "  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)\n",
      "Collecting Pillow>=9.1 (from pdfplumber)\n",
      "  Downloading pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.2 kB)\n",
      "Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
      "  Downloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m955.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in ./env/lib/python3.11/site-packages (from pdfminer.six==20231228->pdfplumber) (3.3.2)\n",
      "Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.4 kB)\n",
      "Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (1.5 kB)\n",
      "Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)\n",
      "Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
      "\u001b[?25hDownloading pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl (3.4 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl (2.7 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl (6.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl (178 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m178.7/178.7 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading pycparser-2.22-py3-none-any.whl (117 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.6/117.6 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: pypdfium2, pycparser, Pillow, cffi, cryptography, pdfminer.six, pdfplumber\n",
      "Successfully installed Pillow-10.4.0 cffi-1.17.1 cryptography-43.0.1 pdfminer.six-20231228 pdfplumber-0.11.4 pycparser-2.22 pypdfium2-4.30.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install pdfplumber\n",
    "#fitz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 351,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UNITED STATES BANKRUPTCY COURT\n",
      "MIDDLE DISTRICT OF GEORGIA\n",
      "IN RE: CHAPTER 13\n",
      "Ashley Unshandra Haygood\n",
      "DEBTOR(S) CASE NO.24−50911−AEC\n",
      "ORDER OF DISMISSAL\n",
      "It appearing to the Court that the trustee has moved to dismiss the above−captioned case, and sent notice of said\n",
      "motion and hearing date to the debtor(s) and debtor's(s') attorney, if any; and a hearing was held on this motion\n",
      "wherein just cause was found to grant the trustee's motion; and it is hereby\n",
      "ORDERED that this case be and the same is dismissed; and it is further\n",
      "ORDERED that the Clerk of this Court take such action as is appropriate to close this case.\n",
      "Dated: 8/26/24 /s/ Austin E. Carter\n",
      "United States Bankruptcy Judge\n",
      "Powered by TCPDF (www.tcpdf.org)\n",
      "1 / 1\n"
     ]
    }
   ],
   "source": [
    "import pdfplumber\n",
    "\n",
    "pdf_path = \"BK Examples/Ashley Unshandra Haygood.pdf\"\n",
    "text_all = \"\"\n",
    "\n",
    "# Open the PDF file\n",
    "with pdfplumber.open(pdf_path) as pdf:\n",
    "    # Iterate through each page\n",
    "    for page in pdf.pages:\n",
    "        text = page.extract_text()\n",
    "        text_all += text\n",
    "\n",
    "print(text_all)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"BK Examples/Ashley Unshandra Haygood.pdf\", \"rb\") as file:\n",
    "    reader = PyPDF2.PdfReader(file)\n",
    "    text_all = ''\n",
    "    # Extract text from each page\n",
    "    for page_num in range(len(reader.pages)):\n",
    "        page = reader.pages[page_num]\n",
    "        text = page.extract_text()\n",
    "        text_all = text_all +text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\" \\nUNITED STATES BANKRUPTCY COURT\\nMIDDLE DISTRICT OF GEORGIA\\nIN RE:                               CHAPTER 13\\nAshley Unshandra Haygood\\nDEBTOR(S)                               CASE NO.24−50911−AEC\\nORDER OF DISMISSAL\\n          It appearing to the Court that the trustee has moved to dismiss the above−captioned case, and sent notice of said\\nmotion and hearing date to the debtor(s) and debtor's(s') attorney, if any; and a hearing was held on this motion\\nwherein just cause was found to grant the trustee's motion; and it is hereby\\nORDERED  that this case be and the same is dismissed; and it is further\\nORDERED  that the Clerk of this Court take such action as is appropriate to close this case.\\nDated: 8/26/24 /s/ Austin E. Carter\\nUnited States Bankruptcy JudgePowered by TCPDF (www.tcpdf.org)\\n                               1 / 1\""
      ]
     },
     "execution_count": 355,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "context = \"\"\"\n",
    "Hugging Face is an AI company specializing in Natural Language Processing (NLP). Their transformers library is widely used for various NLP tasks, including text classification, question answering, and summarization.\n",
    "\"\"\"\n",
    "question = \"What is the state according to the country? Like Utah\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.0804615467786789, 'start': 330, 'end': 383, 'answer': 'United States Bankruptcy Court   District of UtahDate'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Which stage is the proccess at? Answer between petition,discharge or dismissed ?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.3394260108470917, 'start': 1563, 'end': 1578, 'answer': 'chapter 13 plan'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What are the Social Security number or ITIN?\"\n",
    "question = \"List all the Social Security number or ITIN\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What are the Social Security number or ITIN?\"\n",
    "question = \"how many Social Security number or ITIN are in the text?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.7864278554916382, 'start': 137, 'end': 148, 'answer': '461−81−0513'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.\n",
      "\n",
      "All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering\n",
    "import tensorflow as tf\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "model = TFDistilBertForQuestionAnswering.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What is the fifth Social Security number or ITIN in the text?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' \\nInformation to identify the case:\\nDebtor 1 Cristina Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    461−81−0513\\nEIN    _ _−_ _ _ _ _ _ _\\nDebtor 2\\n(Spouse, if filing)Timothy Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    529−97−1200\\nEIN    _ _−_ _ _ _ _ _ _\\nUnited States Bankruptcy Court   District of UtahDate case filed for chapter  13:   8/7/24Case number:   24−23963   JTM\\nOfficial Form 309I\\nNotice of Chapter 13 Bankruptcy Case 10/20\\nFor the debtors listed above, a case has been filed under chapter 13 of the Bankruptcy Code. An order for relief has\\nbeen entered.\\nThis notice has important information about the case for creditors, debtors, and trustees, including information about\\nthe meeting of creditors and deadlines. Read both pages carefully.\\nThe filing of the case imposed an automatic stay against most collection activities. This means that creditors generally may not take action to collect debts\\nfrom the debt'"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering\n",
    "import tensorflow as tf\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "model = TFDistilBertForQuestionAnswering.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "\n",
    "question = \"What is the fifth Social Security number or ITIN in the text?\"\n",
    "inputs = tokenizer(question, text_all[:3000], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)\n",
    "answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = inputs = tokenizer(question, text, return_tensors=\"tf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = inputs['input_ids'].numpy()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "708"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = []\n",
    "for start in range(0, len(input_ids), 512 - 2):  # -2 for [CLS] and [SEP]\n",
    "    end = min(start + 512 - 2, len(input_ids))\n",
    "    chunk = input_ids[start:end]\n",
    "    chunks.append(chunk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_text(text, tokenizer, max_length=512):\n",
    "    # Tokenize the text to get input IDs\n",
    "    inputs = tokenizer(question, text, return_tensors=\"tf\")\n",
    "    ##tokenizer(text, return_tensors=\"tf\", max_length=max_length, truncation=True, padding='max_length')\n",
    "    input_ids = inputs['input_ids'].numpy()[0]\n",
    "    \n",
    "    # Chunk the text\n",
    "    chunks = []\n",
    "    for start in range(0, len(input_ids), max_length - 2):  # -2 for [CLS] and [SEP]\n",
    "        end = min(start + max_length - 2, len(input_ids))\n",
    "        chunk = input_ids[start:end]\n",
    "        chunks.append(chunk)\n",
    "    \n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_text(text, tokenizer, max_length=512):\n",
    "    # Tokenize the text into chunks\n",
    "    tokens = tokenizer(text, add_special_tokens=False, return_tensors=None)['input_ids']\n",
    "    chunks = []\n",
    "    for start in range(0, len(tokens), max_length - 20):  # -2 for [CLS] and [SEP]\n",
    "        end = min(start + max_length - 20, len(tokens))\n",
    "        chunk_tokens = tokens[start:end]\n",
    "        chunk_text = tokenizer.convert_ids_to_tokens(chunk_tokens, skip_special_tokens=True)\n",
    "        chunk_text = tokenizer.convert_tokens_to_string(chunk_text)\n",
    "        chunks.append(chunk_text)\n",
    "    \n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse , if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States Bankruptcy Court District of UtahDate case filed for chapter 13 : 8 / 7 / 24Case number : 24−23963 JTM Official Form 309I Notice of Chapter 13 Bankruptcy Case 10 / 20 For the debtors listed above , a case has been filed under chapter 13 of the Bankruptcy Code . An order for relief has been entered . This notice has important information about the case for creditors , debtors , and trustees , including information about the meeting of creditors and deadlines . Read both pages carefully . The filing of the case imposed an automatic stay against most collection activities . This means that creditors generally may not take action to collect debts from the debtors , the debtors ' property , and certain codebtors . For example , while the stay is in effect , creditors cannot sue , garnish wages , assert a deficiency , repossess property , or otherwise try to collect from the debtors . Creditors cannot demand repayment from debtors by mail , phone , or otherwise . Creditors who violate the stay can be required to pay actual and punitive damages and attorney ' s fees . Under certain circumstances , the stay may be limited to 30 days or not exist at all , although debtors can ask the court to extend or impose a stay . Confirmation of a chapter 13 plan may result in a discharge . Creditors who assert that the debtors are not entitled to a discharge under 11 U . S . C . § 1328 ( f ) must file a motion objecting to discharge in the bankruptcy clerk ' s office within the deadline specified in this notice . Creditors who want to have their debt excepted from discharge may be required to file a complaint in the bankruptcy clerk ' s office by the same deadline . ( See line 13 below for more information . ) To protect your rights , consult an attorney . All documents filed in the case may be inspected at\",\n",
       " \"the bankruptcy clerk ' s office at the address listed below or through PACER ( Public Access to Court Electronic Records at https : / / pacer . uscourts . gov ) . Case status information is available at no charge by calling the Voice Case Information System ( VCIS ) at 1−866−222−8029 . The staff of the bankruptcy clerk ' s office cannot give legal advice . To help creditors correctly identify debtors , debtors submit full Social Security or Individual Taxpayer Identification Numbers , which may appear on a version of this notice . However , the full numbers must not appear on any document filed with the court . Do not file this notice with any proof of claim or other filing in the case . Do not include more than the last four digits of a Social Security or Individual Taxpayer Identification Number in any document , including attachments , that you file with the court . About Debtor 1 : About Debtor 2 : 1 . Debtor ' s full name Cristina Nelson Timothy Nelson 2 . All other names used in the last 8 years 3 . Address3342 N 1825 E Layton , UT 840401788 E 3350 N Layton , UT 84040 4 . Debtor ' s attorney Name and addressE . Kent Winward 4850 Harrison Blvd . Suite 1 Ogden , UT 84403Contact phone ( 801 ) 392−8200 Email : utahbankruptcyfirm @ gmail . com 5 . Bankruptcy trustee Name and addressLon Jenkins tr Ch . 13 Trustee ' s Office 465 South 400 East Suite 200 Salt Lake City , UT 84111Contact phone 801−596−2884 Email : utahtrusteemail @ ch13ut . org 6 . Bankruptcy clerk ' s office Documents in this case may be filed at this address . You may inspect all records filed in this case at this office or online at https : / / pacer . uscourts . gov . United States Bankruptcy Court District of Utah 350 South Main # 301 Salt Lake City , UT 84101 Clerk of Court : David A . SimeHours open : 8 : 00 AM to 4 : 30 PM , Monday − Friday Contact phone : ( 80\",\n",
       " \"##1 ) 524−6687 Website : www . utb . uscourts . gov Official Form 309I Notice of Chapter 13 Bankruptcy Case page 1 Date Generated : 8 / 14 / 24 For more information , see page 2 > 1 / 2 Debtor Cristina Nelson and Timothy Nelson Case number 24−23963 7 . Meeting of creditors Debtors must attend the meeting to be questioned under oath . In a joint case , both spouses must attend . Creditors may attend , but are not required to do so . The meeting may be continued or adjourned to a later date . All individual debtor ( s ) must provide picture identification and proof of social security number to the trustee . Failure to do so may result in your case being dismissed . Meeting to be held on : Sep . 5 , 2024 at 11 : 00 AM Location : Zoom video meeting . Go to Zoom . us / join , Enter Meeting ID 437 037 6107 , and Passcode 3375693596 , OR call 1−385−832−9074 For additional meeting information go to https : / / www . justice . gov / ust / moc 8 . Deadlines The bankruptcy clerk ' s office must receive these documents and any required filing fee by the following deadlines . Deadline to file a complaint to challenge dischargeability of certain debts : Filing deadline : 11 / 4 / 24 You must file : • a motion if you assert that the debtors are not entitled to receive a discharge under U . S . C . § 1328 ( f ) , or • a complaint if you want to have a particular debt excepted from discharge under 11 U . S . C . § 523 ( a ) ( 2 ) or ( 4 ) . Deadline for all creditors to file a proof of claim ( except governmental units ) : Filing deadline : 10 / 16 / 24 Deadline for governmental units to file a proof of claim : Filing deadline : 2 / 3 / 25 Deadlines for filing proof of claim : A proof of claim is a signed statement describing a creditor ' s claim . A proof of claim form may be obtained at www . uscourts . gov or any bankruptcy clerk ' s office . If you do not file a proof of claim by the deadline , you might not be\",\n",
       " 'paid on your claim . To be paid , you must file a proof of claim even if your claim is listed in the schedules that the debtor filed . Secured creditors retain rights in their collateral regardless of whether they file a proof of claim . Filing a proof of claim submits the creditor to the jurisdiction of the bankruptcy court , with consequences a lawyer can explain . For example , a secured creditor who files a proof of claim may surrender important nonmonetary rights , including the right to a jury trial . Deadline to object to exemptions : The law permits debtors to keep certain property as exempt . If you believe that the law does not authorize an exemption claimed , you may file an objection . Filing deadline : 30 days after the conclusion of the meeting of creditors 9 . Filing of plan and confirmation hearing on docket Objections to ConfirmationThe debtor has filed a plan . The hearing on confirmation will be held on : 10 / 10 / 24 at 10 : 00 AM Location : This meeting is by Zoom . Go to , ZoomGov . com / join or call 1 + ( 669 ) 254−5252 , Enter Meeting ID 161 5478 8875 , Passcode 3834658 Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation . If there are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a court order or a stipulation signed by the debtor , the trustee and the objecting party , a plan may be confirmed without objection , and the hearing stricken . 10 . Creditors with a foreign addressIf you are a creditor receiving a notice mailed to a foreign address , you may file a motion asking the court to extend the deadline in this notice . Consult an attorney familiar with United States bankruptcy law if you have any questions about your rights in this case . 11 . Filing a chapter 13 bankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts according to a plan . A plan is not effective unless the court confirms it . You may object to confirmation of the plan and appear at the confirmation hearing . A copy of the plan , if not enclosed , will be sent to you later , and if the confirmation hearing is not indicated on this notice , you will',\n",
       " \"be sent notice of the confirmation hearing . The debtor will remain in possession of the property and may continue to operate the business , if any , unless the court orders otherwise . 12 . Exempt property The law allows debtors to keep certain property as exempt . Fully exempt property will not be sold and distributed to creditors , even if the case is converted to chapter 7 . Debtors must file a list of property claimed as exempt . You may inspect that list at the bankruptcy clerk ' s office or online at https : / / pacer . uscourts . gov . If you believe that the law does not authorize an exemption that debtors claimed , you may file an objection by the deadline . 13 . Discharge of debts Confirmation of a chapter 13 plan may result in a discharge of debts , which may include all or part of a debt . However , unless the court orders otherwise , the debts will not be discharged until all payments under the plan are made . A discharge means that creditors may never try to collect the debt from the debtors personally except as provided in the plan . If you want to have a particular debt excepted from discharge under 11 U . S . C . § 523 ( a ) ( 2 ) or ( 4 ) , you must file a complaint and pay the filing fee in the bankruptcy clerk ' s office by the deadline . If you believe that the debtors are not entitled to a discharge of any of their debts under 11 U . S . C . § 1328 ( f ) , you must file a motion by the deadline . Official Form 309I Notice of Chapter 13 Bankruptcy Case page 2Powered by TCPDF ( www . tcpdf . org ) 2 / 2\"]"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_text(text_all, tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = chunk_text(text_all, tokenizer)\n",
    "start_logits = []\n",
    "end_logits = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 197,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'paid on your claim . To be paid , you must file a proof of claim even if your claim is listed in the schedules that the debtor filed . Secured creditors retain rights in their collateral regardless of whether they file a proof of claim . Filing a proof of claim submits the creditor to the jurisdiction of the bankruptcy court , with consequences a lawyer can explain . For example , a secured creditor who files a proof of claim may surrender important nonmonetary rights , including the right to a jury trial . Deadline to object to exemptions : The law permits debtors to keep certain property as exempt . If you believe that the law does not authorize an exemption claimed , you may file an objection . Filing deadline : 30 days after the conclusion of the meeting of creditors 9 . Filing of plan and confirmation hearing on docket Objections to ConfirmationThe debtor has filed a plan . The hearing on confirmation will be held on : 10 / 10 / 24 at 10 : 00 AM Location : This meeting is by Zoom . Go to , ZoomGov . com / join or call 1 + ( 669 ) 254−5252 , Enter Meeting ID 161 5478 8875 , Passcode 3834658 Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation . If there are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a court order or a stipulation signed by the debtor , the trustee and the objecting party , a plan may be confirmed without objection , and the hearing stricken . 10 . Creditors with a foreign addressIf you are a creditor receiving a notice mailed to a foreign address , you may file a motion asking the court to extend the deadline in this notice . Consult an attorney familiar with United States bankruptcy law if you have any questions about your rights in this case . 11 . Filing a chapter 13 bankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts according to a plan . A plan is not effective unless the court confirms it . You may object to confirmation of the plan and appear at the confirmation hearing . A copy of the plan , if not enclosed , will be sent to you later , and if the confirmation hearing is not indicated on this notice , you will'"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunks[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What are the Social Security numbers or ITIN in the text?'"
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = tokenizer(question, chunks[0], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)\n",
    "start_logits.append(outputs.start_logits)\n",
    "end_logits.append(outputs.end_logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_logits.append(outputs.start_logits)\n",
    "end_logits.append(outputs.end_logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What are the Social Security numbers or ITIN in the text?'"
      ]
     },
     "execution_count": 208,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "\n",
    "    for chunk in chunks:\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True)\n",
    "        outputs = model(**inputs)\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "    \n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    \n",
    "    print('1', start_logits)\n",
    "    print('2', end_logits)\n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    print('3', answer_start_index)\n",
    "    print('4', answer_end_index)\n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = inputs[\"input_ids\"][0, answer_start_index : answer_end_index + 1]\n",
    "    return tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 tf.Tensor([[-1.251733  -7.9249597 -9.82178   ... -7.0328403 -4.5499053 -8.900514 ]], shape=(1, 2383), dtype=float32)\n",
      "2 tf.Tensor(\n",
      "[[ -0.8806853   -8.205478   -10.482227   ...  -6.0008802   -0.83891714\n",
      "   -9.523805  ]], shape=(1, 2383), dtype=float32)\n",
      "3 503\n",
      "4 503\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#question = \"What are is the third Social Security number or ITIN in the text?\"\n",
    "question = \"What is the Notice of Chapter?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "    input_ids_all = []\n",
    "\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        \n",
    "        # Collect logits and input_ids for all chunks\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "        input_ids_all.append(inputs[\"input_ids\"])\n",
    "\n",
    "    # Concatenate all logits and input_ids\n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    input_ids_all = tf.concat(input_ids_all, axis=1)\n",
    "    \n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    \n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = input_ids_all[0, answer_start_index : answer_end_index + 1]\n",
    "    return tokenizer.decode(predict_answer_tokens)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "    input_ids_all = []\n",
    "    all_ands = []\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        #print(outputs)\n",
    "        answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "        answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "        print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "        all_ands.append(tokenizer.decode(predict_answer_tokens))\n",
    "\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "        input_ids_all.append(inputs[\"input_ids\"])\n",
    "        print(len(start_logits[0][0]),len(end_logits[0][0]))\n",
    "    \n",
    "    # Concatenate all logits and input_ids\n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    input_ids_all = tf.concat(input_ids_all, axis=1)\n",
    "    print('xx',start_logits)\n",
    "    print('x0',start_logits[0, 151].numpy())\n",
    "    print('x1',tf.math.argmax(start_logits, axis=-1))\n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = input_ids_all[0, answer_start_index : answer_end_index + 1]\n",
    "    print('here',tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "\n",
    "    return all_ands\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    answer_start_indexs = []\n",
    "    answer_end_indexs = []\n",
    "    input_ids_all = []\n",
    "    all_ands = []\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        #print(outputs)\n",
    "        answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "        answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "        if answer_start_index != 0 and answer_end_index != 0:\n",
    "            answer_start_indexs.append(answer_start_index)\n",
    "            answer_end_indexs.append(answer_end_index)\n",
    "            break\n",
    "        #predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "        #print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "        #all_ands.append(tokenizer.decode(predict_answer_tokens))\n",
    "    print(answer_start_indexs[0])\n",
    "    print(answer_end_indexs[0])\n",
    "    predict_answer_tokens = inputs.input_ids[0, answer_start_indexs[0] : answer_end_indexs[0] + 1]\n",
    "    print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "    return all_ands\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "174\n",
      "174\n",
      "dismissed 174 174\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question = \"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"which chapter of bankruptcy is referenced in the text?\"\n",
    "#\"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"which is the country?\"\n",
    "#\"Which date case was filed?\"\n",
    "#\"which is the Bankruptcy Court district?\"\n",
    "#\"which is the country?\"\n",
    "#\"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"What is the second Social Security number or ITIN in the text?\"\n",
    "#\"which chapter of bankruptcy is referenced in the text?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] 0 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question = \"which bankruptcy stages does the text refer: petitioned,dismissed/withdrawn or discharged?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] which country and state? [SEP] Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse, if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States 0 103\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "['[CLS] which country and state? [SEP] Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse, if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States', '[CLS]', '[CLS]', '[CLS]', '[CLS]']\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question3 = \"which country and state?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question3, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\" \\nInformation to identify the case:\\nDebtor 1 Cristina Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    461−81−0513\\nEIN    _ _−_ _ _ _ _ _ _\\nDebtor 2\\n(Spouse, if filing)Timothy Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    529−97−1200\\nEIN    _ _−_ _ _ _ _ _ _\\nUnited States Bankruptcy Court   District of UtahDate case filed for chapter  13:   8/7/24Case number:   24−23963   JTM\\nOfficial Form 309I\\nNotice of Chapter 13 Bankruptcy Case 10/20\\nFor the debtors listed above, a case has been filed under chapter 13 of the Bankruptcy Code. An order for relief has\\nbeen entered.\\nThis notice has important information about the case for creditors, debtors, and trustees, including information about\\nthe meeting of creditors and deadlines. Read both pages carefully.\\nThe filing of the case imposed an automatic stay against most collection activities. This means that creditors generally may not take action to collect debts\\nfrom the debtors, the debtors' property, and certain codebtors. For example, while the stay is in effect, creditors cannot sue, garnish wages, assert a\\ndeficiency, repossess property, or otherwise try to collect from the debtors. Creditors cannot demand repayment from debtors by mail, phone, or\\notherwise. Creditors who violate the stay can be required to pay actual and punitive damages and attorney's fees. Under certain circumstances, the stay\\nmay be limited to 30 days or not exist at all, although debtors can ask the court to extend or impose a stay.\\nConfirmation of a chapter 13 plan may result in a discharge. Creditors who assert that the debtors are not entitled to a discharge under 11 U.S.C. §\\n1328(f) must file a motion objecting to discharge in the bankruptcy clerk's office within the deadline specified in this notice. Creditors who want to have\\ntheir debt excepted from discharge may be required to file a complaint in the bankruptcy clerk's office by the same deadline. (See line 13 below for more\\ninformation.)\\nTo protect your rights, consult an attorney. All documents filed in the case may be inspected at the bankruptcy clerk's office at the address listed below or\\nthrough PACER (Public Access to Court Electronic Records at https://pacer.uscourts.gov). Case status information is available at no charge by calling the\\nVoice Case Information System (VCIS) at 1−866−222−8029.\\nThe staff of the bankruptcy clerk's office cannot give legal advice.\\nTo help creditors correctly identify debtors, debtors submit full Social Security or Individual Taxpayer Identification\\nNumbers, which may appear on a version of this notice. However, the full numbers must not appear on any document\\nfiled with the court.\\nDo not file this notice with any proof of claim or other filing in the case. Do not include more than the last four digits of\\na Social Security or Individual Taxpayer Identification Number in any document, including attachments, that you file\\nwith the court.\\nAbout Debtor 1: About Debtor 2:\\n1. Debtor's full name Cristina Nelson Timothy Nelson\\n2.All other names used in the\\nlast 8 years\\n3. Address3342 N 1825 E\\nLayton, UT 840401788 E 3350 N\\nLayton, UT 84040\\n4.Debtor's attorney\\nName and addressE. Kent Winward\\n4850 Harrison Blvd. Suite 1\\nOgden, UT 84403Contact phone (801) 392−8200\\nEmail:  utahbankruptcyfirm@gmail.com\\n5. Bankruptcy trustee\\nName and addressLon Jenkins tr\\nCh. 13 Trustee's Office\\n465 South 400 East\\nSuite 200\\nSalt Lake City, UT 84111Contact phone 801−596−2884\\nEmail:  utahtrusteemail@ch13ut.org\\n6. Bankruptcy clerk's office\\nDocuments in this case may be filed\\nat this address.\\nYou may inspect all records filed in\\nthis case at this office or online at\\nhttps://pacer.uscourts.gov.United States Bankruptcy Court\\nDistrict of Utah\\n350 South Main #301\\nSalt Lake City, UT 84101\\nClerk of Court: David A. SimeHours open: 8:00 AM to 4:30 PM, Monday −\\nFriday\\nContact phone: (801) 524−6687\\nWebsite: www.utb.uscourts.gov\\nOfficial Form 309I    Notice of Chapter 13 Bankruptcy Case page 1\\nDate Generated: 8/14/24 For more information, see page 2 >\\n                               1 / 2 \\nDebtor  Cristina Nelson   and  Timothy Nelson Case number 24−23963\\n7. Meeting of creditors\\n Debtors must attend the meeting to\\nbe questioned under oath. In a joint\\ncase, both spouses must attend.\\nCreditors may attend, but are not\\nrequired to do so.The meeting may be continued or adjourned\\nto a later date.\\nAll individual debtor(s) must provide picture\\nidentification and proof of social security\\nnumber to the trustee. Failure to do so may\\nresult in your case being dismissed.Meeting to be held on:\\nSep. 5, 2024  at 11:00 AM\\nLocation:\\nZoom video meeting. Go to Zoom.us/join,\\nEnter Meeting ID 437 037 6107, and\\nPasscode 3375693596, OR call\\n1−385−832−9074\\nFor additional meeting information go to https://www.justice.gov/ust/moc\\n8. Deadlines\\nThe bankruptcy clerk's office must\\nreceive these documents and any\\nrequired filing fee by the following\\ndeadlines.Deadline to file a complaint to challenge\\ndischargeability of certain debts:Filing deadline: 11/4/24\\nYou must file:\\n•   a motion if you assert that the debtors are not entitled to receive a discharge\\n    under U.S.C. § 1328(f), or\\n•   a complaint if you want to have a particular debt excepted from discharge\\n    under 11 U.S.C. § 523(a)(2) or (4).\\nDeadline for all creditors to file a proof of claim\\n(except governmental units):Filing deadline: 10/16/24\\nDeadline for governmental units to file a proof of\\nclaim:Filing deadline: 2/3/25\\nDeadlines for filing proof of claim:\\n A proof of claim is a signed statement describing a creditor's claim. A proof of claim form may be obtained at\\nwww.uscourts.gov or any bankruptcy clerk's office.\\nIf you do not file a proof of claim by the deadline, you might not be paid on your claim. To be paid, you must file a\\nproof of claim even if your claim is listed in the schedules that the debtor filed.\\nSecured creditors retain rights in their collateral regardless of whether they file a proof of claim. Filing a proof of\\nclaim submits the creditor to the jurisdiction of the bankruptcy court, with consequences a lawyer can explain. For\\nexample, a secured creditor who files a proof of claim may surrender important nonmonetary rights, including the\\nright to a jury trial.\\nDeadline to object to exemptions:\\nThe law permits debtors to keep certain property as exempt. If\\nyou believe that the law does not authorize an exemption\\nclaimed, you may file an objection.Filing\\ndeadline:30 days after the\\nconclusion  of the\\nmeeting of\\ncreditors\\n9. Filing of plan and\\nconfirmation hearing on\\ndocket\\nObjections to ConfirmationThe debtor has filed a plan.\\nThe hearing on confirmation will be held on:  10/10/24  at 10:00 AM\\nLocation:  This meeting is by Zoom. Go to, ZoomGov.com/join or call\\n1+(669)254−5252, Enter Meeting ID 161 5478 8875, Passcode 3834658\\n Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation. If\\nthere are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a\\ncourt order or a stipulation signed by the debtor, the trustee and the objecting party, a plan may be confirmed\\nwithout objection, and the hearing stricken.\\n10. Creditors with a foreign\\naddressIf you are a creditor receiving a notice mailed to a foreign address, you may file a motion asking the court to\\nextend the deadline in this notice. Consult an attorney familiar with United States bankruptcy law if you have any\\nquestions about your rights in this case.\\n11. Filing a chapter 13\\nbankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts\\naccording to a plan. A plan is not effective unless the court confirms it. You may object to confirmation of the\\nplan and appear at the confirmation hearing. A copy of the plan, if not enclosed, will be sent to you later, and if\\nthe confirmation hearing is not indicated on this notice, you will be sent notice of the confirmation hearing. The\\ndebtor will remain in possession of the property and may continue to operate the business, if any, unless the\\ncourt orders otherwise.\\n12. Exempt property The law allows debtors to keep certain property as exempt. Fully exempt property will not be sold and distributed\\nto creditors, even if the case is converted to chapter 7. Debtors must file a list of property claimed as exempt.\\nYou may inspect that list at the bankruptcy clerk's office or online at https://pacer.uscourts.gov. If you believe\\nthat the law does not authorize an exemption that debtors claimed, you may file an objection by the deadline.\\n13. Discharge of debts Confirmation of a chapter 13 plan may result in a discharge of debts, which may include all or part of a debt.\\nHowever, unless the court orders otherwise, the debts will not be discharged until all payments under the plan\\nare made. A discharge means that creditors may never try to collect the debt from the debtors personally except\\nas provided in the plan. If you want to have a particular debt excepted from discharge under 11 U.S.C. §\\n523(a)(2) or (4), you must file a complaint and pay the filing fee in the bankruptcy clerk's office by the deadline. If\\nyou believe that the debtors are not entitled to a discharge of any of their debts under 11 U.S.C. § 1328(f), you\\nmust file a motion by the deadline.\\nOfficial Form 309I    Notice of Chapter 13 Bankruptcy Case page 2Powered by TCPDF (www.tcpdf.org)\\n                               2 / 2\\n\""
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9489"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-09-12 09:24:06.178737: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[0,512] = 512 is not in [0, 512)\n"
     ]
    },
    {
     "ename": "InvalidArgumentError",
     "evalue": "Exception encountered when calling layer 'embeddings' (type TFEmbeddings).\n\n{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather] name: \n\nCall arguments received by layer 'embeddings' (type TFEmbeddings):\n  • input_ids=tf.Tensor(shape=(1, 709), dtype=int32)\n  • position_ids=None\n  • inputs_embeds=None\n  • training=False",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mInvalidArgumentError\u001b[0m                      Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[131], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m inputs \u001b[38;5;241m=\u001b[39m tokenizer(question, text_all[:\u001b[38;5;241m3000\u001b[39m], return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     67\u001b[0m     filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m     68\u001b[0m     \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m     69\u001b[0m     \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m     72\u001b[0m     \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/modeling_tf_utils.py:437\u001b[0m, in \u001b[0;36munpack_inputs.<locals>.run_call_with_unpacked_inputs\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    434\u001b[0m     config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\n\u001b[1;32m    436\u001b[0m unpacked_inputs \u001b[38;5;241m=\u001b[39m input_processing(func, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfn_args_and_kwargs)\n\u001b[0;32m--> 437\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43munpacked_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:1091\u001b[0m, in \u001b[0;36mTFDistilBertForQuestionAnswering.call\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict, start_positions, end_positions, training)\u001b[0m\n\u001b[1;32m   1061\u001b[0m \u001b[38;5;129m@unpack_inputs\u001b[39m\n\u001b[1;32m   1062\u001b[0m \u001b[38;5;129m@add_start_docstrings_to_model_forward\u001b[39m(DISTILBERT_INPUTS_DOCSTRING\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_size, sequence_length\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m   1063\u001b[0m \u001b[38;5;129m@add_code_sample_docstrings\u001b[39m(\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1079\u001b[0m     training: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m   1080\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[TFQuestionAnsweringModelOutput, Tuple[tf\u001b[38;5;241m.\u001b[39mTensor]]:\n\u001b[1;32m   1081\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1082\u001b[0m \u001b[38;5;124;03m    start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[1;32m   1083\u001b[0m \u001b[38;5;124;03m        Labels for position (index) of the start of the labelled span for computing the token classification loss.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1089\u001b[0m \u001b[38;5;124;03m        are not taken into account for computing the loss.\u001b[39;00m\n\u001b[1;32m   1090\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1091\u001b[0m     distilbert_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdistilbert\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1092\u001b[0m \u001b[43m        \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1093\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1094\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1095\u001b[0m \u001b[43m        \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1096\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1098\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1099\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtraining\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtraining\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1100\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1101\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m distilbert_output[\u001b[38;5;241m0\u001b[39m]  \u001b[38;5;66;03m# (bs, max_query_len, dim)\u001b[39;00m\n\u001b[1;32m   1102\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(hidden_states, training\u001b[38;5;241m=\u001b[39mtraining)  \u001b[38;5;66;03m# (bs, max_query_len, dim)\u001b[39;00m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/modeling_tf_utils.py:437\u001b[0m, in \u001b[0;36munpack_inputs.<locals>.run_call_with_unpacked_inputs\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    434\u001b[0m     config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\n\u001b[1;32m    436\u001b[0m unpacked_inputs \u001b[38;5;241m=\u001b[39m input_processing(func, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfn_args_and_kwargs)\n\u001b[0;32m--> 437\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43munpacked_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:454\u001b[0m, in \u001b[0;36mTFDistilBertMainLayer.call\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict, training)\u001b[0m\n\u001b[1;32m    451\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    452\u001b[0m     head_mask \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_hidden_layers\n\u001b[0;32m--> 454\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m    455\u001b[0m tfmr_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer(\n\u001b[1;32m    456\u001b[0m     embedding_output,\n\u001b[1;32m    457\u001b[0m     attention_mask,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    462\u001b[0m     training\u001b[38;5;241m=\u001b[39mtraining,\n\u001b[1;32m    463\u001b[0m )\n\u001b[1;32m    465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfmr_output\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:117\u001b[0m, in \u001b[0;36mTFEmbeddings.call\u001b[0;34m(self, input_ids, position_ids, inputs_embeds, training)\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m position_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    115\u001b[0m     position_ids \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mexpand_dims(tf\u001b[38;5;241m.\u001b[39mrange(start\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, limit\u001b[38;5;241m=\u001b[39minput_shape[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]), axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m--> 117\u001b[0m position_embeds \u001b[38;5;241m=\u001b[39m \u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgather\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    118\u001b[0m final_embeddings \u001b[38;5;241m=\u001b[39m inputs_embeds \u001b[38;5;241m+\u001b[39m position_embeds\n\u001b[1;32m    119\u001b[0m final_embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mLayerNorm(inputs\u001b[38;5;241m=\u001b[39mfinal_embeddings)\n",
      "\u001b[0;31mInvalidArgumentError\u001b[0m: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).\n\n{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather] name: \n\nCall arguments received by layer 'embeddings' (type TFEmbeddings):\n  • input_ids=tf.Tensor(shape=(1, 709), dtype=int32)\n  • position_ids=None\n  • inputs_embeds=None\n  • training=False"
     ]
    }
   ],
   "source": [
    "inputs = tokenizer(question, text_all[:3000], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForQuestionAnswering: ['roberta.embeddings.position_ids']\n",
      "- This IS expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the weights of TFRobertaForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForQuestionAnswering for predictions without further training.\n",
      "/Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "\nAutoModelForQuestionAnswering requires the PyTorch library but it was not found in your environment.\nHowever, we were able to find a TensorFlow installation. TensorFlow classes begin\nwith \"TF\", but are otherwise identically named to our PyTorch classes. This\nmeans that the TF equivalent of the class you tried to import would be \"TFAutoModelForQuestionAnswering\".\nIf you want to use TensorFlow, please use TF classes instead!\n\nIf you really do want to use PyTorch please go to\nhttps://pytorch.org/get-started/locally/ and follow the instructions that\nmatch your environment.\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[56], line 14\u001b[0m\n\u001b[1;32m     11\u001b[0m res \u001b[38;5;241m=\u001b[39m nlp(QA_input)\n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# b) Load model & tokenizer\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForQuestionAnswering\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m(model_name)\n\u001b[1;32m     15\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/utils/import_utils.py:1543\u001b[0m, in \u001b[0;36mDummyObject.__getattribute__\u001b[0;34m(cls, key)\u001b[0m\n\u001b[1;32m   1541\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_from_config\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m   1542\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__getattribute__\u001b[39m(key)\n\u001b[0;32m-> 1543\u001b[0m \u001b[43mrequires_backends\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backends\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/utils/import_utils.py:1522\u001b[0m, in \u001b[0;36mrequires_backends\u001b[0;34m(obj, backends)\u001b[0m\n\u001b[1;32m   1520\u001b[0m \u001b[38;5;66;03m# Raise an error for users who might not realize that classes without \"TF\" are torch-only\u001b[39;00m\n\u001b[1;32m   1521\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_available() \u001b[38;5;129;01mand\u001b[39;00m is_tf_available():\n\u001b[0;32m-> 1522\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(PYTORCH_IMPORT_ERROR_WITH_TF\u001b[38;5;241m.\u001b[39mformat(name))\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;66;03m# Raise the inverse error for PyTorch users trying to load TF classes\u001b[39;00m\n\u001b[1;32m   1525\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m is_torch_available() \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_tf_available():\n",
      "\u001b[0;31mImportError\u001b[0m: \nAutoModelForQuestionAnswering requires the PyTorch library but it was not found in your environment.\nHowever, we were able to find a TensorFlow installation. TensorFlow classes begin\nwith \"TF\", but are otherwise identically named to our PyTorch classes. This\nmeans that the TF equivalent of the class you tried to import would be \"TFAutoModelForQuestionAnswering\".\nIf you want to use TensorFlow, please use TF classes instead!\n\nIf you really do want to use PyTorch please go to\nhttps://pytorch.org/get-started/locally/ and follow the instructions that\nmatch your environment.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
    "\n",
    "model_name = \"deepset/roberta-base-squad2\"\n",
    "\n",
    "# a) Get predictions\n",
    "nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)\n",
    "QA_input = {\n",
    "    'question': 'Why is model conversion important?',\n",
    "    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'\n",
    "}\n",
    "res = nlp(QA_input)\n",
    "\n",
    "# b) Load model & tokenizer\n",
    "model = AutoModelForQuestionAnswering.from_pretrained(model_name)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = nlp(QA_input)\n",
    "print(res)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'langchain_mistralai'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_mistralai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMistralAI\n\u001b[1;32m      3\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatMistralAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmistral-large-latest\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m      5\u001b[0m runnable \u001b[38;5;241m=\u001b[39m prompt \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mPerson)\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_mistralai'"
     ]
    }
   ],
   "source": [
    "from langchain_mistralai import ChatMistralAI\n",
    "\n",
    "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n",
    "\n",
    "runnable = prompt | llm.with_structured_output(schema=Person)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Optional\n",
    "\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "\n",
    "class Bankruptcy(BaseModel):\n",
    "    \"\"\"Information about a bankruptcy declaration.\"\"\"\n",
    "\n",
    "    # ^ Doc-string for the entity Person.\n",
    "    # This doc-string is sent to the LLM as the description of the schema Person,\n",
    "    # and it can help to improve extraction results.\n",
    "\n",
    "    # Note that:\n",
    "    # 1. Each field is an `optional` -- this allows the model to decline to extract it!\n",
    "    # 2. Each field has a `description` -- this description is used by the LLM.\n",
    "    # Having a good description can help improve extraction results.\n",
    "    ssns: Optional[list] = Field(default=None, description=\"The ssns of the persons\")\n",
    "    chapter: Optional[str] = Field(\n",
    "        default=None, description=\"The chapter of the bankruptcy declaration\"\n",
    "    )\n",
    "    country: Optional[str] = Field(\n",
    "        default=None, description=\"Country were the bankruptcy declaration is made\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Data(BaseModel):\n",
    "    \"\"\"Extracted data about bankruptcy declaration..\"\"\"\n",
    "\n",
    "    # Creates a model so that we can extract multiple entities.\n",
    "    people: List[Bankruptcy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'prompt' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m runnable \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mData)\n\u001b[1;32m      2\u001b[0m runnable\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: text_all})\n",
      "\u001b[0;31mNameError\u001b[0m: name 'prompt' is not defined"
     ]
    }
   ],
   "source": [
    "runnable = prompt | llm.with_structured_output(schema=Data)\n",
    "runnable.invoke({\"text\": text_all})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(text_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find SSNs\n",
    "ssn_pattern = r'\\b(?:Social Security number|ITIN)\\D*(\\d{3}[−\\s]\\d{2}[−\\s]\\d{4})\\b'\n",
    "ssns = re.findall(ssn_pattern, text_all)\n",
    "\n",
    "def find_ssns(text):\n",
    "    ssns = re.findall(ssn_pattern, text_all)\n",
    "    return ssns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find chapter\n",
    "chapter_pattern = r'Notice of Chapter (\\d+) Bankruptcy Case \\d{1,2}/\\d{2}'\n",
    "\n",
    "def find_chapter(text):\n",
    "    chapters = re.findall(chapter_pattern, text_all)\n",
    "    return chapters[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "country_code = {\"United States\": \"US\", \"Canada\":\"CA\"}\n",
    "\n",
    "country_pattern = r'\\b(?:United States|Canada)\\b'\n",
    "\n",
    "def find_country_code(text):\n",
    "    country_match = re.search(country_pattern, text, re.IGNORECASE)\n",
    "    return country_code.get(country_match[0],None) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find State\n",
    "state_pattern = r'\\nDistrict of (\\w+)'\n",
    "\n",
    "# Dictionaries for state codes\n",
    "us_states = {\n",
    "    \"Alabama\": \"AL\", \"Alaska\": \"AK\", \"Arizona\": \"AZ\", \"Arkansas\": \"AR\", \"California\": \"CA\",\n",
    "    \"Colorado\": \"CO\", \"Connecticut\": \"CT\", \"Delaware\": \"DE\", \"Florida\": \"FL\", \"Georgia\": \"GA\",\n",
    "    \"Hawaii\": \"HI\", \"Idaho\": \"ID\", \"Illinois\": \"IL\", \"Indiana\": \"IN\", \"Iowa\": \"IA\",\n",
    "    \"Kansas\": \"KS\", \"Kentucky\": \"KY\", \"Louisiana\": \"LA\", \"Maine\": \"ME\", \"Maryland\": \"MD\",\n",
    "    \"Massachusetts\": \"MA\", \"Michigan\": \"MI\", \"Minnesota\": \"MN\", \"Mississippi\": \"MS\", \"Missouri\": \"MO\",\n",
    "    \"Montana\": \"MT\", \"Nebraska\": \"NE\", \"Nevada\": \"NV\", \"New Hampshire\": \"NH\", \"New Jersey\": \"NJ\",\n",
    "    \"New Mexico\": \"NM\", \"New York\": \"NY\", \"North Carolina\": \"NC\", \"North Dakota\": \"ND\", \"Ohio\": \"OH\",\n",
    "    \"Oklahoma\": \"OK\", \"Oregon\": \"OR\", \"Pennsylvania\": \"PA\", \"Rhode Island\": \"RI\", \"South Carolina\": \"SC\",\n",
    "    \"South Dakota\": \"SD\", \"Tennessee\": \"TN\", \"Texas\": \"TX\", \"Utah\": \"UT\", \"Vermont\": \"VT\",\n",
    "    \"Virginia\": \"VA\", \"Washington\": \"WA\", \"West Virginia\": \"WV\", \"Wisconsin\": \"WI\", \"Wyoming\": \"WY\"\n",
    "}\n",
    "\n",
    "canadian_provinces = {\n",
    "    \"Alberta\": \"AB\", \"British Columbia\": \"BC\", \"Manitoba\": \"MB\", \"New Brunswick\": \"NB\", \"Newfoundland and Labrador\": \"NL\",\n",
    "    \"Northwest Territories\": \"NT\", \"Nova Scotia\": \"NS\", \"Nunavut\": \"NU\", \"Ontario\": \"ON\", \"Prince Edward Island\": \"PE\",\n",
    "    \"Quebec\": \"QC\", \"Saskatchewan\": \"SK\", \"Yukon\": \"YT\"\n",
    "}\n",
    "\n",
    "def find_state_code(text,country_code):\n",
    "    state_match = re.search(state_pattern, text)\n",
    "    \n",
    "    if state_match:\n",
    "        # Extract the state or province name from the match\n",
    "        state_name = state_match.group(1).strip()\n",
    "    \n",
    "    if country_code == 'US':\n",
    "        state_code = us_states.get(state_name,None)\n",
    "    elif country_code == 'CA':\n",
    "        state_code = canadian_provinces.get(state_name,None)\n",
    "    else:\n",
    "        state_code = None\n",
    "    \n",
    "    return state_code\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find stage\n",
    "stage_patterns = {\n",
    "    'Petition': r'\\b(case filed|petition filed|automatic stay)\\b',\n",
    "    'Discharge': r'\\b(discharge of debts|discharge order|case discharged)\\b',\n",
    "    'Dismissed': r'\\b(case dismissed|dismissal|converted to Chapter 7)\\b'\n",
    "}\n",
    "\n",
    "# Function to categorize bankruptcy stages from text\n",
    "def categorize_stage(text):\n",
    "    categorized_stages = {'Petition': False, 'Discharge': False, 'Dismissed': False}\n",
    "    \n",
    "    for stage, pattern in stage_patterns.items():\n",
    "        if re.search(pattern, text, re.IGNORECASE):\n",
    "            categorized_stages[stage] = True\n",
    "    \n",
    "    # Determine the final stage based on the presence of keywords\n",
    "    if categorized_stages['Petition']:\n",
    "        return 'Petition'\n",
    "    elif categorized_stages['Discharge']:\n",
    "        return 'Discharge'\n",
    "    elif categorized_stages['Dismissed']:\n",
    "        return 'Dismissed'\n",
    "    else:\n",
    "        return 'Unknown'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data found: {'ssns': ['461−81−0513', '529−97−1200'], 'chapter': '13', 'country_code': 'US', 'state': 'UT', 'stage': 'Petition'}\n"
     ]
    }
   ],
   "source": [
    "data = { \"ssns\": find_ssns(text_all),\n",
    "        \"chapter\": find_chapter(text_all),\n",
    "        \"country_code\": find_country_code(text_all),\n",
    "        \"state\": find_state_code(text_all, find_country_code(text_all)),\n",
    "        \"stage\": categorize_stage(text_all)\n",
    "        }\n",
    "\n",
    "print(f\"Data found: {data}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}