File size: 1,181 Bytes
d012361
 
1ade467
 
 
 
 
 
d012361
 
1ade467
d012361
 
 
 
 
 
 
 
 
6f88ba1
 
 
d012361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
import uuid
from langchain.text_splitter import CharacterTextSplitter

# Module-level splitter shared by importers of this file: configured to
# split on newlines with chunk_size=3000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=3000,
    separator="\n",
)

def generate_uuid():
    """Return a newly generated random (version 4) UUID in string form."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)

def check_id_extis_in_json(file_id):
    """Report whether ``file_id`` is present in ``file_ids.json``.

    The file is looked up relative to the current working directory and is
    re-read on every call.

    Args:
        file_id: Value to look for. The check is a plain ``in`` test against
            the parsed JSON, so it matches elements when the file holds an
            array and keys when it holds an object.

    Returns:
        ``True`` if ``file_id`` is found, ``False`` otherwise.

    Raises:
        OSError: If ``file_ids.json`` cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale encoding, which can mis-decode non-ASCII ids.
    with open('file_ids.json', 'r', encoding='utf-8') as f:
        file_ids = json.load(f)
    return file_id in file_ids

def compare_paper_ids(data, paper_ids):
    """Return the entries of ``paper_ids`` with no matching ``doi_no`` in ``data``.

    Args:
        data: Iterable of items indexable by the key ``'doi_no'``.
        paper_ids: Iterable of identifiers to look up.

    Returns:
        A list of the ids from ``paper_ids`` that do not appear as any
        item's ``'doi_no'``, in their original order (duplicates are kept).
    """
    # Collect the known DOIs once so each lookup below is a set membership test.
    known = set()
    for record in data:
        known.add(record['doi_no'])

    absent = []
    for candidate in paper_ids:
        if candidate in known:
            continue
        absent.append(candidate)
    return absent

def extract_json_from_text(text):
    """Extract the ``data`` entry from a JSON object embedded in ``text``.

    The input is converted with ``str()``, then the span from the first
    ``{`` to the last ``}`` is lowercased and parsed as JSON.

    Args:
        text: Any object; its ``str()`` form is searched for a JSON object.

    Returns:
        On success, the value stored under the ``'data'`` key of the parsed
        object, or ``[]`` when that key is absent. Because the payload is
        lowercased before parsing, keys are matched case-insensitively and
        any string values come back lowercased.
        On any failure (no braces found, invalid JSON, ...), the input's
        ``str()`` form is returned unchanged, so callers must be prepared
        for a plain string.
    """
    text = str(text)
    try:
        # Take everything between the first '{' and the last '}'. When no
        # braces are present the slice is empty and json.loads raises,
        # which routes to the fallback below.
        start_index = text.find('{')
        end_index = text.rfind('}') + 1
        json_part = text[start_index:end_index]
        # Lowercase the whole payload (keys and values alike) before parsing.
        json_part = json.loads(json_part.lower())
        print("json", type(json_part))
        print(json_part)
        return json_part.get('data', [])

    except Exception as e:
        # Best-effort contract: report the problem in red and hand back the
        # raw text. The trailing \033[0m resets the terminal colour.
        print(f"\033[31m Exception occurred while loading JSON: {e} \033[0m")
        return text