raghuv-aditya committed on
Commit
68a165d
•
1 Parent(s): 4b5594c

Create text_processing.py

Browse files
Files changed (1) hide show
  1. text_processing.py +35 -0
text_processing.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Typographic punctuation -> ASCII equivalents. The code points are written
# as \u escapes (not literal characters) so the table cannot be corrupted if
# this file is ever saved or displayed with the wrong encoding.
_PUNCTUATION_TO_ASCII = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
})


def sanitize_text(text):
    """
    Cleans and standardizes text by replacing special characters.

    Curly single/double quotation marks are replaced with their straight
    ASCII counterparts, and en/em dashes are replaced with a hyphen.

    Args:
        text (str): Text to sanitize. Values that are not ``str`` are
            returned unchanged.

    Returns:
        str: Sanitized text (or the original object when it is not a str).
    """
    if not isinstance(text, str):
        # Non-string values (None, numbers, ...) are passed through untouched.
        return text
    # Single pass over the string instead of six chained .replace() calls.
    return text.translate(_PUNCTUATION_TO_ASCII)
16
+
17
+
18
def generate_text(course_data):
    """
    Renders scraped course records as a single block of structured text.

    Args:
        course_data (list): Course dictionaries. Each one is indexed with
            the keys 'Title', 'Description', 'Curriculum' and 'Link'.

    Returns:
        str: One section per course, each terminated by a dashed separator
            line. An empty string when course_data has no entries.
    """
    sections = []
    for entry in course_data:
        # Sanitize fields in the order Title, Description, Curriculum, Link.
        title = sanitize_text(entry['Title'])
        description = sanitize_text(entry['Description'])
        curriculum = sanitize_text(entry['Curriculum'])
        link = sanitize_text(entry['Link'])
        sections.append(
            f"## {title}\n\n"
            f"**Description:**\n{description}\n\n"
            f"**Curriculum:**\n{curriculum}\n\n"
            f"**Link:**\n{link}\n\n"
            "-------------------------------------\n\n"
        )
    return "".join(sections)