NHLOCAL committed on
Commit
0e6fd2c
·
1 Parent(s): edc87b4

add talmud

Browse files
app.py CHANGED
@@ -6,15 +6,17 @@ import joblib
6
  nltk.download('punkt')
7
 
8
  # Load the trained model and vectorizer outside the function for better performance
9
- loaded_classifier = joblib.load("is_this_bible_model.pkl")
10
- vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
11
 
12
  def parse_text(new_text):
13
  new_text_tfidf = vectorizer.transform([new_text])
14
  prediction = loaded_classifier.predict(new_text_tfidf)
15
  probabilities = loaded_classifier.predict_proba(new_text_tfidf)
16
- confidence_score = probabilities[0, 1]
17
- return 'ืชื "ืš' if prediction[0] == 1 else 'ืื—ืจ', confidence_score
 
 
18
 
19
  iface = gr.Interface(fn=parse_text, inputs="text", outputs=["text", "number"], title='ื’ื™ืœื•ื™ ืคืกื•ืงื™ ื”ืชื "ืš ื‘ืืžืฆืขื•ืช AI', description='ื”ื–ืŸ ื˜ืงืกื˜ ื›ื“ื™ ืœืกื•ื•ื’ ืื ื”ื•ื ืžื”ืชื "ืš ืื• ืœื.')
20
  iface.launch()
 
# Make sure the NLTK sentence tokenizer data is present before serving.
nltk.download('punkt')

# Load the trained model and vectorizer outside the function for better performance
loaded_classifier = joblib.load("bible_or_talmud_model.pkl")
vectorizer = joblib.load("bible_or_talmud_vectorizer.pkl")

# Map classifier class ids to user-facing Hebrew labels; hoisted to module
# level so the dict is not rebuilt on every prediction call.
# 0 = other, 1 = Tanakh (Bible), 2 = Babylonian Talmud.
LABELS = {0: 'ืื—ืจ', 1: 'ืชื "ืš', 2: 'ืชืœืžื•ื“ ื‘ื‘ืœื™'}


def parse_text(new_text):
    """Classify *new_text* as Bible, Babylonian Talmud, or other.

    Returns a (label, confidence) tuple where confidence is the
    probability of the predicted class (the highest probability).
    """
    new_text_tfidf = vectorizer.transform([new_text])
    prediction = loaded_classifier.predict(new_text_tfidf)
    probabilities = loaded_classifier.predict_proba(new_text_tfidf)
    # Confidence of the *predicted* class, i.e. the maximum probability.
    confidence_score = max(probabilities[0])
    predicted_label = LABELS[prediction[0]]
    return predicted_label, confidence_score


# Bug fix: the description still advertised a binary Bible/not-Bible
# classifier even though the model now also detects the Babylonian Talmud.
iface = gr.Interface(fn=parse_text, inputs="text", outputs=["text", "number"], title='ื’ื™ืœื•ื™ ืคืกื•ืงื™ ื”ืชื "ืš ื‘ืืžืฆืขื•ืช AI', description='ื”ื–ืŸ ื˜ืงืกื˜ ื›ื“ื™ ืœืกื•ื•ื’ ืื ื”ื•ื ืžื”ืชื "ืš, ืžื”ืชืœืžื•ื“ ื”ื‘ื‘ืœื™ ืื• ืื—ืจ.')
iface.launch()
data_creation/text_identification_model.pkl โ†’ bible_or_talmud_model.pkl RENAMED
File without changes
data_creation/text_identification_vectorizer.pkl โ†’ bible_or_talmud_vectorizer.pkl RENAMED
File without changes
try_model.py DELETED
@@ -1,74 +0,0 @@
from sys import argv
#import re
import nltk
from nltk.corpus import stopwords
import joblib


"""
# Remove punctuation and special characters
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Function to remove custom stop words from text
def remove_custom_stopwords(text):
    hebrew_stopwords = set(stopwords.words('hebrew'))
    additional_stopwords = {'ืื ื™', 'ืืชื”', 'ืืช', 'ืื ื—ื ื•', 'ืืชื', 'ืืชืŸ', 'ื”ื', 'ื”ืŸ'}
    hebrew_stopwords.update(additional_stopwords)
    return ' '.join(word for word in text.split() if word not in hebrew_stopwords)


# Preprocess the new text (remove punctuation and custom stop words)
# To re-enable the disabled preprocessing, apply it to the new_text variable
new_text_cleaned = remove_custom_stopwords(remove_punctuation(new_text))
"""


# Load the trained classifier and the TF-IDF vectorizer it was fitted with;
# both are loaded once at import time.
loaded_classifier = joblib.load("is_this_bible_model.pkl")
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")


def parse_text(new_text):
    """Classify one text and print the prediction with its confidence."""
    # Vectorize, predict, and read the probability of the "Bible" class
    # (column index 1 of predict_proba).
    tfidf_features = vectorizer.transform([new_text])
    prediction = loaded_classifier.predict(tfidf_features)
    probabilities = loaded_classifier.predict_proba(tfidf_features)
    confidence_score = probabilities[0, 1]  # The confidence score for class "Bible" (index 1)

    print(f"Text: {new_text} | Prediction: {'Bible' if prediction[0] == 1 else 'Other'} | Confidence Score: {confidence_score:.4f}")


# Demo sentences: a mix of Bible verses and modern Hebrew text.
text_list = [
    'ืื ื™ ื™ื•ืฉื‘ ืคื” ื‘ืฉืงื˜ ื•ืžืงืœืœ ืืช ื”ืขื•ื‘ื“ื” ืฉื—ืœืง ืžื”ืชื•ื›ื ื•ืช ืฉืื ื™ ืžืชื—ื–ืง ืงืฉื•ืจื” ืœืคื™ื™ืชื•ืŸ 2.4, ืฉืื™ืŸ ืœื” ืืช ื–ื”',
    'ื›ืžื” ื™ืคื” ื•ื ืื” ื›ืฉืฉื•ืžืขื™ื ื”ืฉื™ืจื” ืฉืœื”ื',
    'ื•ื”ื™ื” ื‘ืขืช ื”ื”ื™ื ืื—ืคืฉ ืืช ื™ืจื•ืฉืœื™ื ื‘ื ืจื•ืช ื•ื”ื•ื“ืขืชื™ื” ืืช ื›ืœ ืชื•ืขื‘ื•ืชื™ื”',
    'ื•ื”ื™ื ืฉืขืžื“ื” ืœืื‘ื•ืชื™ื ื• ื•ืœื ื• ืฉืœื ืื—ื“ ื‘ืœื‘ื“ ืขืžื“ ืขืœื™ื ื• ืœื›ืœื•ืชื™ื ื•',
    'ืื ื™ ื”ืกืชื›ืœืชื™ ืœืฉืžื™ื ืืชื” ืฆืœืœืช ื‘ืžื™ื',
    'ื”ืฆื‘ ื”ื•ื ื‘ืขืœ ื—ื™ื™ื ืฉื—ื™ ื‘ื™ื ื•ื‘ื™ื‘ืฉื”',
    'ื•ื”ื™ื” ื”ื ืฉืืจ ื‘ืฆื™ื•ืŸ ื•ื”ื ื•ืชืจ ื‘ื™ืจื•ืฉืœื™ื ืงื“ื•ืฉ ื™ืืžืจ ืœื•',
    'ืฉื™ืจ ื”ืฉื™ืจื™ื ืืฉืจ ืœืฉืœืžื”',
    'ื™ืฉืงื ื™ ืžื ืฉื™ืงื•ืช ืคื™ื”ื• ื›ื™ ื˜ื•ื‘ื™ื ื“ื•ื“ื™ืš ืžื™ื™ืŸ',
    'ื•ื”ื™ื” ืจืง ืžืœื ืฉืžื—ื” ื•ื—ื“ื•ื” ืชืžื™ื“ ื›ืฉื”ื™ื” ื’ื•ืžืจ ื”ืžื ืขืœ ื•ืžืŸ ื”ืกืชื ื”ื™ื” ืœื• ืฉืœืฉื” ืงืฆื•ื•ืช',
    'ื–ื” ืžืขืฉื” ืฉืœื• ื•ื–ื” ืžืขืฉื” ืฉืœื™ ื•ืขื•ื“ ืžื” ืœื ื• ืœื“ื‘ืจ ืžืื—ืจื™ื',
    'ื“ื•ื“ื™ ื™ืจื“ ืœื’ื ื• ืœืขืจื•ื’ื•ืช ื”ื‘ื•ืฉื ืœืจืขื•ืช ื‘ื’ื ื™ื ื•ืœืœืงื•ื˜ ืฉื•ืฉื ื™ื',
    'ื•ื™ืžืจื• ื‘ื™ ื‘ื™ืช ื™ืฉืจืืœ ื‘ืžื“ื‘ืจ ื‘ื—ืงื•ืชื™ ืœื ื”ืœื›ื• ื•ืืช ืžืฉืคื˜ื™ ืžืืกื• ืืฉืจ ื™ืขืฉื” ืืชื ื”ืื“ื ื•ื—ื™ ื‘ื”ื',
    'ื–ื” ืœื ืžืฉื ื” ืื•ืคื ื™ื™ื ื ืขืœื™ื™ื ื”ืขื™ืงืจ ื–ื” ื‘ื—ื™ื™ื',
    'ื–ื›ื•ืจ ืืช ื™ื•ื ื”ืฉื‘ืช ืœืงื“ืฉื•',
    'ื•ื™ืฉืœื— ื™ืขืงื‘ ืžืœืื›ื™ื ืœืคื ื™ื• ืืœ ืขืฉื™ื• ืื—ื™ื•',
    'ืœืš ืœืš ืžืืจืฆืš ื•ืžืžื•ืœื“ืชืš ื•ืžื‘ื™ืช ืื‘ื™ืš',
    'ืขื“ื›ื•ืŸ :ื“ื•ืจ ืœื“ื•ืจ ืชื "ืš ,ืžืื•ืจืขื•ืช ื‘ื–ืžืŸ ื”ืชื "ืš ืงืจื“ื™ื˜']

# Classify the first CLI argument when one is given (extra arguments are
# ignored, as before); otherwise run the whole demo list.
for sample in (argv[1:2] or text_list):
    parse_text(sample)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_creation/try_model.py โ†’ try_talmud_or_bible.py RENAMED
File without changes