oceansweep commited on
Commit
02da3a7
·
verified ·
1 Parent(s): a2369d8

Update App_Function_Libraries/Utils/Utils.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Utils/Utils.py +861 -861
App_Function_Libraries/Utils/Utils.py CHANGED
@@ -1,861 +1,861 @@
1
- # Utils.py
2
- #########################################
3
- # General Utilities Library
4
- # This library is used to hold random utilities used by various other libraries.
5
- #
6
- ####
7
- ####################
8
- # Function List
9
- #
10
- # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
- # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
- # 3. verify_checksum(file_path, expected_checksum)
13
- # 4. create_download_directory(title)
14
- # 5. sanitize_filename(filename)
15
- # 6. normalize_title(title)
16
- # 7.
17
- #
18
- ####################
19
- #
20
- # Import necessary libraries
21
- import chardet
22
- import configparser
23
- import hashlib
24
- import json
25
- import logging
26
- import os
27
- import re
28
- import tempfile
29
- import time
30
- import uuid
31
- from datetime import timedelta
32
- from typing import Union, AnyStr
33
- from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
34
- #
35
- # Non-Local Imports
36
- import requests
37
- import unicodedata
38
- from tqdm import tqdm
39
- #
40
- #######################################################################################################################
41
- #
42
- # Function Definitions
43
-
44
def extract_text_from_segments(segments, include_timestamps=True):
    """Flatten transcription segments into a single block of text.

    Args:
        segments: A dict, a list, or any nesting of the two. Dicts are searched
            for a 'Text' entry; lists are flattened one item per line.
        include_timestamps: When true, a dict holding both 'Time_Start' and
            'Time_End' is rendered as "<start>s - <end>s | <text>".

    Returns:
        The stripped text, or the literal string
        "Error: Unable to extract transcription" when nothing was found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _collect(node, with_times):
        # Lists: render every child, drop the empty results, one per line.
        if isinstance(node, list):
            rendered = [_collect(child, with_times) for child in node]
            return '\n'.join(piece for piece in rendered if piece)

        if not isinstance(node, dict):
            return None

        # A dict with both timestamps is rendered directly, even without 'Text'.
        if with_times and 'Time_Start' in node and 'Time_End' in node:
            body = node.get('Text', '')
            return f"{node['Time_Start']:.2f}s - {node['Time_End']:.2f}s | {body}"

        # Otherwise walk the entries in order: 'Text' wins as soon as it is
        # reached, containers seen before it are searched recursively.
        for name, child in node.items():
            if name == 'Text':
                return child
            if isinstance(child, (dict, list)):
                found = _collect(child, with_times)
                if found:
                    return found
        return None

    extracted = _collect(segments, include_timestamps)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
71
-
72
- #
73
- #
74
- #######################
75
- # Temp file cleanup
76
- #
77
- # Global list to keep track of downloaded files
78
downloaded_files = []


def cleanup_downloads():
    """Delete every path recorded in ``downloaded_files`` that still exists.

    Failures are reported on stdout and never raised, so one bad entry does
    not stop the remaining files from being removed. The list itself is left
    unchanged.
    """
    for tracked_path in downloaded_files:
        try:
            if not os.path.exists(tracked_path):
                continue
            os.remove(tracked_path)
            print(f"Cleaned up file: {tracked_path}")
        except Exception as err:
            print(f"Error cleaning up file {tracked_path}: {err}")
89
-
90
- #
91
- #
92
- #######################################################################################################################
93
-
94
-
95
- #######################################################################################################################
96
- # Config loading
97
- #
98
def load_comprehensive_config():
    """Read ``Config_Files/config.txt`` from the project root.

    The project root is taken to be two directory levels above the directory
    containing this module.

    Returns:
        configparser.ConfigParser: The parsed configuration.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    # Directory of the current script (Utils.py)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    logging.debug(f"Current directory: {current_dir}")

    # Go up two levels to the project root directory
    project_root = os.path.dirname(os.path.dirname(current_dir))
    logging.debug(f"Project root directory: {project_root}")

    config_path = os.path.join(project_root, 'Config_Files', 'config.txt')
    logging.debug(f"Config file path: {config_path}")

    if not os.path.exists(config_path):
        logging.error(f"Config file not found at {config_path}")
        raise FileNotFoundError(f"Config file not found at {config_path}")

    config = configparser.ConfigParser()
    config.read(config_path)

    # The f-prefix was missing here, so the placeholder was logged verbatim
    # instead of the section names.
    logging.debug(f"load_comprehensive_config(): Sections found in config: {config.sections()}")

    return config
124
-
125
-
126
def get_project_root():
    """Return the absolute path of the directory two levels above this module's directory."""
    root = os.path.abspath(__file__)
    # One step to reach this module's directory, two more to reach the root.
    for _ in range(3):
        root = os.path.dirname(root)
    logging.debug(f"Project root: {root}")
    return root
132
-
133
-
134
def get_database_dir():
    """Return the absolute path of ``<project root>/Databases``, creating it if missing."""
    database_dir = os.path.join(get_project_root(), 'Databases')
    os.makedirs(database_dir, exist_ok=True)
    logging.debug(f"Database directory: {database_dir}")
    return database_dir
140
-
141
-
142
def get_database_path(db_name: str) -> str:
    """Return the absolute path of a database file inside the Databases directory.

    Only the final path component of ``db_name`` is used, so directory parts
    (including ``..`` traversal) are discarded and the result always lies
    directly inside the Databases directory.
    """
    file_name = os.path.basename(db_name)
    full_path = os.path.join(get_database_dir(), file_name)
    logging.debug(f"Database path for {file_name}: {full_path}")
    return full_path
152
-
153
-
154
def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
    """Join ``relative_path`` onto the project root and return the result.

    The argument is converted with ``str()`` before joining. Because
    ``os.path.join`` is used, an absolute ``relative_path`` replaces the
    project root rather than being appended to it.
    """
    joined = os.path.join(get_project_root(), str(relative_path))
    logging.debug(f"Project relative path for {relative_path}: {joined}")
    return joined
159
-
160
def get_chromadb_path():
    """Return ``<project root>/Databases/chroma_db`` without creating it."""
    chroma_path = os.path.join(get_project_root(), 'Databases', 'chroma_db')
    logging.debug(f"ChromaDB path: {chroma_path}")
    return chroma_path
164
-
165
def ensure_directory_exists(path):
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)
168
-
169
- # FIXME - update to include prompt path in return statement
170
def load_and_log_configs():
    """Load the project configuration and return it as a nested dict.

    Returns:
        dict | None: A dict with the keys 'api_keys', 'models',
        'local_api_ip', 'output_path', 'processing_choice', 'db_config',
        'embedding_config', 'auto-save' and 'default_api'. ``None`` is
        returned (after logging the error) if anything fails while loading.

    Note:
        API keys that are absent from the config are returned as ``None``.
        Previously a single absent key made the debug-log formatting raise
        ``TypeError`` (slicing ``None``), which the broad ``except`` turned
        into a ``None`` result for the whole configuration.
    """

    def _masked(secret):
        """Return a first-5/last-5 preview of *secret*, or None when it is unset."""
        if not secret:
            return None
        return f"{secret[:5]}...{secret[-5:]}"

    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        # (result key / option prefix, label in key log, label in model log, default model)
        hosted_providers = (
            ('anthropic', 'Anthropic', 'Anthropic', 'claude-3-sonnet-20240229'),
            ('cohere', 'Cohere', 'Cohere', 'command-r-plus'),
            ('groq', 'Groq', 'Groq', 'llama3-70b-8192'),
            ('openai', 'OpenAI', 'OpenAI', 'gpt-4-turbo'),
            ('huggingface', 'HuggingFace', 'HuggingFace', 'CohereForAI/c4ai-command-r-plus'),
            ('openrouter', 'OpenRouter', 'OpenRouter', 'microsoft/wizardlm-2-8x22b'),
            ('deepseek', 'DeepSeek', 'Deepseek', 'deepseek-chat'),
            ('mistral', 'Mistral', 'Mistral', 'mistral-large-latest'),
        )

        # API Keys
        api_keys = {}
        for name, key_label, _model_label, _default_model in hosted_providers:
            api_keys[name] = config.get('API', f'{name}_api_key', fallback=None)
            logging.debug(f"Loaded {key_label} API Key: {_masked(api_keys[name])}")

        # Models
        models = {}
        for name, _key_label, _model_label, default_model in hosted_providers:
            models[name] = config.get('API', f'{name}_model', fallback=default_model)
        for name, _key_label, model_label, _default_model in hosted_providers:
            logging.debug(f"Loaded {model_label} Model: {models[name]}")

        # Local-Models
        kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

        llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

        ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
        ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

        tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
        # NOTE(review): every other local model is read from 'Local-API'; this
        # one is read from a 'models' section - confirm that is intended.
        tabby_model = config.get('models', 'tabby_model', fallback=None)

        # NOTE(review): the fallback port 500 looks like a typo - confirm.
        vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
        vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
        vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

        ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
        ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
        ollama_model = config.get('Local-API', 'ollama_model', fallback=None)

        aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')

        custom_openai_api_key = config.get('API', 'custom_openai_api_key', fallback=None)
        custom_openai_api_url = config.get('API', 'custom_openai_url', fallback=None)
        logging.debug(
            f"Loaded Custom openai-like endpoint API Key: {_masked(custom_openai_api_key)}")

        logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
        logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
        logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
        logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
        logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

        api_keys.update({
            'kobold': kobold_api_key,
            'llama': llama_api_key,
            'ooba': ooba_api_key,
            'tabby': tabby_api_key,
            'vllm': vllm_api_key,
            'ollama': ollama_api_key,
            'aphrodite': aphrodite_api_key,
            'custom_openai_api_key': custom_openai_api_key,
        })
        models.update({
            'vllm': vllm_model,
            'tabby': tabby_model,
            'ollama': ollama_model,
        })

        # Retrieve default API choices from the configuration file
        default_api = config.get('API', 'default_api', fallback='openai')

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Retrieve Embedding model settings from the configuration file
        embedding_model = config.get('Embeddings', 'embedding_model', fallback='')
        logging.debug(f"Embedding model set to: {embedding_model}")
        embedding_provider = config.get('Embeddings', 'embedding_provider', fallback='')
        onnx_model_path = config.get('Embeddings', 'onnx_model_path', fallback="./App_Function_Libraries/onnx_models/text-embedding-3-small.onnx")
        model_dir = config.get('Embeddings', 'model_dir', fallback="./App_Function_Libraries/onnx_models")
        embedding_api_url = config.get('Embeddings', 'embedding_api_url', fallback="http://localhost:8080/v1/embeddings")
        embedding_api_key = config.get('Embeddings', 'embedding_api_key', fallback='')
        # config.get() returns a str when the option is set and the int
        # fallback otherwise; left as-is so existing callers see no change.
        chunk_size = config.get('Embeddings', 'chunk_size', fallback=400)
        overlap = config.get('Embeddings', 'overlap', fallback=200)

        # Auto-Save Values
        save_character_chats = config.get('Auto-Save', 'save_character_chats', fallback='False')
        save_rag_chats = config.get('Auto-Save', 'save_rag_chats', fallback='False')

        return {
            'api_keys': api_keys,
            'models': models,
            'local_api_ip': {
                'kobold': kobold_api_ip,
                'llama': llama_api_IP,
                'ooba': ooba_api_IP,
                'tabby': tabby_api_IP,
                'vllm': vllm_api_url,
                'ollama': ollama_api_url,
                'aphrodite': aphrodite_api_url,
                'custom_openai_api_ip': custom_openai_api_url
            },
            'output_path': output_path,
            'processing_choice': processing_choice,
            'db_config': {
                'prompt_path': get_project_relative_path(config.get('Prompts', 'prompt_path', fallback='Databases/prompts.db')),
                'db_type': config.get('Database', 'type', fallback='sqlite'),
                'sqlite_path': get_project_relative_path(config.get('Database', 'sqlite_path', fallback='Databases/media_summary.db')),
                'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
                'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200),
                'chroma_db_path': get_project_relative_path(config.get('Database', 'chroma_db_path', fallback='Databases/chroma.db'))
            },
            'embedding_config': {
                'embedding_provider': embedding_provider,
                'embedding_model': embedding_model,
                'onnx_model_path': onnx_model_path,
                'model_dir': model_dir,
                'embedding_api_url': embedding_api_url,
                'embedding_api_key': embedding_api_key,
                'chunk_size': chunk_size,
                'overlap': overlap
            },
            'auto-save': {
                'save_character_chats': save_character_chats,
                'save_rag_chats': save_rag_chats,
            },
            'default_api': default_api
        }

    except Exception as e:
        logging.error(f"Error loading config: {str(e)}")
        return None
366
-
367
global_api_endpoints = ["anthropic", "cohere", "groq", "openai", "huggingface", "openrouter", "deepseek", "mistral", "custom_openai_api", "llama", "ooba", "kobold", "tabby", "vllm", "ollama", "aphrodite"]

# Setup Default API Endpoint
loaded_config_data = load_and_log_configs()
if loaded_config_data is None:
    # load_and_log_configs() returns None when loading fails; subscripting it
    # would abort the import of this module with a TypeError. Fall back to the
    # same default the config loader uses for a missing 'default_api' option.
    logging.error("Configuration could not be loaded; defaulting API endpoint to 'openai'")
    default_api_endpoint = 'openai'
else:
    default_api_endpoint = loaded_config_data['default_api']
372
-
373
def format_api_name(api):
    """Return the display name for an API identifier.

    Known identifiers map to their branded spelling; anything else is
    returned title-cased.
    """
    # Computed up front, as the original did, so a non-string argument fails
    # in the same way whether or not it is a known identifier.
    title_cased = api.title()
    display_names = dict(
        openai="OpenAI",
        anthropic="Anthropic",
        cohere="Cohere",
        groq="Groq",
        huggingface="HuggingFace",
        openrouter="OpenRouter",
        deepseek="DeepSeek",
        mistral="Mistral",
        custom_openai_api="Custom-OpenAI-API",
        llama="Llama.cpp",
        ooba="Ooba",
        kobold="Kobold",
        tabby="Tabbyapi",
        vllm="VLLM",
        ollama="Ollama",
        aphrodite="Aphrodite",
    )
    return display_names.get(api, title_cased)
393
- print(f"Default API Endpoint: {default_api_endpoint}")
394
-
395
-
396
-
397
- #
398
- # End of Config loading
399
- #######################################################################################################################
400
-
401
-
402
- #######################################################################################################################
403
- #
404
- # Prompt Handling Functions
405
-
406
-
407
-
408
- #
409
- # End of Prompt Handling Functions
410
- ### #############################################################################################################
411
-
412
- #######################################################################################################################
413
- #
414
- # Misc-Functions
415
-
416
- # Log file
417
- # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
418
-
419
def format_metadata_as_text(metadata):
    """Render a metadata dict as a human-readable, line-per-field string.

    Args:
        metadata: Mapping of field name to value. Entries whose value is
            ``None`` are skipped.

    Returns:
        "No metadata available" for an empty/missing mapping, otherwise a
        "Video Metadata:" header followed by one "Key: value" line per field.

    Formatting rules:
        * lists are joined with ", "
        * an 8-character 'upload_date' becomes YYYY-MM-DD (str or int input)
        * numeric 'view_count' / 'like_count' get thousands separators
        * 'duration' (seconds) becomes HH:MM:SS; fractional seconds are
          truncated, and a value that is not a number is shown unchanged
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue

        if isinstance(value, list):
            formatted_value = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # Slice the string form so an integer date such as 20230615 works.
            date_text = str(value)
            formatted_value = f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"
        elif key in ('view_count', 'like_count'):
            # The ',' format spec only applies to numbers.
            if isinstance(value, (int, float)):
                formatted_value = f"{value:,}"
            else:
                formatted_value = str(value)
        elif key == 'duration':
            try:
                total_seconds = int(value)
            except (TypeError, ValueError):
                formatted_value = str(value)
            else:
                hours, remainder = divmod(total_seconds, 3600)
                minutes, seconds = divmod(remainder, 60)
                formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            formatted_value = str(value)

        # Replace underscores with spaces in the key name
        formatted_key = key.replace('_', ' ').capitalize()
        lines.append(f"{formatted_key}: {formatted_value}")

    return "\n".join(lines).strip()
447
-
448
- # # Example usage:
449
- # example_metadata = {
450
- # 'title': 'Sample Video Title',
451
- # 'uploader': 'Channel Name',
452
- # 'upload_date': '20230615',
453
- # 'view_count': 1000000,
454
- # 'like_count': 50000,
455
- # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
456
- # 'tags': ['tag1', 'tag2', 'tag3'],
457
- # 'description': 'This is a sample video description.'
458
- # }
459
- #
460
- # print(format_metadata_as_text(example_metadata))
461
-
462
-
463
def convert_to_seconds(time_str):
    """Convert a time value to a whole number of seconds.

    Args:
        time_str: A number of seconds (int or float, truncated to int) or a
            string in the form "SS", "MM:SS" or "HH:MM:SS". Falsy input
            (``None``, "", 0) yields 0.

    Returns:
        int: The total number of seconds.

    Raises:
        ValueError: If the string has more than three ':'-separated parts or
            a part is not an integer.
    """
    if not time_str:
        return 0

    # Already a number: treat it as seconds. Previously this raised
    # AttributeError because .isdigit() only exists on strings.
    if isinstance(time_str, (int, float)):
        return int(time_str)

    if time_str.isdigit():
        return int(time_str)

    # Parse time string in format HH:MM:SS, MM:SS, or SS
    time_parts = time_str.split(':')
    if len(time_parts) > 3:
        raise ValueError(f"Invalid time format: {time_str}")

    # Fold left to right: each earlier field is worth 60x the next one.
    total = 0
    for part in time_parts:
        total = total * 60 + int(part)
    return total
484
-
485
- #
486
- # End of Misc-Functions
487
- #######################################################################################################################
488
-
489
-
490
- #######################################################################################################################
491
- #
492
- # File-saving Function Definitions
493
def save_to_file(video_urls, filename):
    """Write the given URLs to ``filename``, one per line, and report it on stdout.

    The file is written as UTF-8 so the result does not depend on the
    platform's default encoding (matching how the JSON segments are saved).
    An existing file is overwritten; no trailing newline is added.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(video_urls))
    print(f"Video URLs saved to {filename}")
497
-
498
-
499
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """
    Write transcription segments to ``Results/<file_name>`` as JSON.

    The ``Results`` directory is relative to the current working directory
    and is created when missing. Output is UTF-8, non-ASCII characters are
    kept as-is, and the JSON is indented by four spaces.

    Parameters:
        segments (list): List of transcription segments
        file_name (str): Name of the JSON file to save (default: "transcription_segments.json")

    Returns:
        str: Path to the saved JSON file
    """
    results_dir = "Results"
    os.makedirs(results_dir, exist_ok=True)

    destination = os.path.join(results_dir, file_name)
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(segments, handle, ensure_ascii=False, indent=4)

    return destination
521
-
522
-
523
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
    """Download ``url`` to ``dest_path`` with resume, retries and optional checksum.

    Data is streamed into ``dest_path + '.tmp'`` and moved into place once the
    download (and the checksum, when given) succeeded. If a partial temp file
    exists, an HTTP ``Range`` request is sent to resume it.

    Args:
        url: Source URL.
        dest_path: Final path of the downloaded file.
        expected_checksum: Optional SHA-256 hex digest to verify against.
        max_retries: Number of attempts before giving up.
        delay: Seconds to wait between attempts.

    Returns:
        ``dest_path`` on success.

    Raises:
        Exception: The error of the last attempt is re-raised once
            ``max_retries`` attempts have failed.
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Ask the server to continue where a previous attempt stopped.
            resume_from = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
            request_headers = {'Range': f'bytes={resume_from}-'} if resume_from else {}

            with requests.get(url, stream=True, headers=request_headers) as response:
                if response.status_code == 416 and resume_from:
                    # The partial file does not match what the server can
                    # serve from that offset; discard it so the next attempt
                    # starts from scratch.
                    os.remove(temp_path)
                response.raise_for_status()

                # 'Range' is a *request* header and never appears in the
                # response. A server that honours it answers 206 Partial
                # Content; a plain 200 means it is sending the whole file
                # again, so the partial data must be overwritten.
                resumed = response.status_code == 206
                if not resumed:
                    resume_from = 0
                mode = 'ab' if resumed else 'wb'

                # Content-Length covers only the bytes of this response, so
                # add the bytes already on disk for the progress bar total.
                remaining = int(response.headers.get('content-length', 0))
                total_size = resume_from + remaining if remaining else None

                with open(temp_path, mode) as temp_file, tqdm(
                    total=total_size, unit='B', unit_scale=True, desc=dest_path,
                    initial=resume_from, ascii=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive new chunks
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # os.replace overwrites an existing destination on every platform,
            # whereas os.rename fails on Windows when dest_path exists.
            os.replace(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
568
-
569
def create_download_directory(title):
    """Create (if needed) and return ``Results/<normalized title>``.

    The title is normalized with ``normalize_title(..., preserve_spaces=False)``
    before being used as the directory name. The path is relative to the
    current working directory.
    """
    safe_title = normalize_title(title, preserve_spaces=False)
    logging.debug(f"{title} successfully normalized")

    session_path = os.path.join("Results", safe_title)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
    else:
        os.makedirs(session_path, exist_ok=True)
        logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
581
-
582
-
583
- import chardet
584
- import logging
585
-
586
def safe_read_file(file_path):
    """Read a text file whose encoding is unknown and return its content.

    The raw bytes are decoded with the encoding guessed by ``chardet`` first,
    then with a list of common encodings. A decoding is accepted when more
    than 95% of its characters are printable or whitespace.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The decoded text ("" for an empty file). On failure a
        human-readable error message is returned instead of raising:
        "File not found: ...", "An error occurred while reading the file: ..."
        or "Unable to decode the file ...".
    """
    # utf-8-sig goes first so a UTF-8 byte-order mark is stripped rather than
    # kept as a leading U+FEFF character by the plain utf-8 codec.
    encodings = ['utf-8-sig', 'utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']

    logging.info(f"Attempting to read file: {file_path}")

    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return f"File not found: {file_path}"
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {e}")
        return f"An error occurred while reading the file: {e}"

    if not raw_data:
        logging.warning(f"File is empty: {file_path}")
        return ""

    # Use chardet to detect the encoding
    detected = chardet.detect(raw_data)
    if detected['encoding'] is not None:
        encodings.insert(0, detected['encoding'])
        logging.info(f"Detected encoding: {detected['encoding']}")

    for encoding in encodings:
        try:
            decoded_content = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet reported a codec name Python does not know.
            logging.debug(f"Failed to decode with {encoding}")
            continue

        # A file holding nothing but a byte-order mark decodes to "".
        if not decoded_content:
            logging.info(f"Successfully decoded file with encoding: {encoding}")
            return decoded_content

        # str.isprintable() is False for newlines and tabs, so whitespace has
        # to be counted separately; otherwise ordinary multi-line text falls
        # below the threshold for every encoding.
        readable = sum(c.isprintable() or c.isspace() for c in decoded_content)
        if readable / len(decoded_content) > 0.95:
            logging.info(f"Successfully decoded file with encoding: {encoding}")
            return decoded_content

    # If all decoding attempts fail, return the error message
    logging.error(f"Unable to decode the file {file_path}")
    return f"Unable to decode the file {file_path}"
625
-
626
-
627
- #
628
- # End of Files-saving Function Definitions
629
- #######################################################################################################################
630
-
631
-
632
- #######################################################################################################################
633
- #
634
- # UUID-Functions
635
-
636
def generate_unique_filename(base_path, base_filename):
    """Return a filename that does not yet exist inside ``base_path``.

    ``base_filename`` is returned unchanged when it is free; otherwise
    ``_1``, ``_2``, ... is inserted before the extension until an unused
    name is found. Nothing is created on disk.
    """
    stem, extension = os.path.splitext(base_filename)
    candidate = base_filename
    suffix = 0
    while os.path.exists(os.path.join(base_path, candidate)):
        suffix += 1
        candidate = f"{stem}_{suffix}{extension}"
    return candidate
645
-
646
-
647
def generate_unique_identifier(file_path):
    """Build an identifier of the form ``local:<unix time>:<md5 prefix>:<filename>``.

    The MD5 prefix is the first 8 hex characters of the digest of the file's
    full content; the timestamp is the current time in whole seconds.
    """
    filename = os.path.basename(file_path)
    timestamp = int(time.time())

    # MD5 is used here only to fingerprint the content.
    with open(file_path, 'rb') as source:
        digest = hashlib.md5(source.read()).hexdigest()

    return f"local:{timestamp}:{digest[:8]}:{filename}"
659
-
660
- #
661
- # End of UUID-Functions
662
- #######################################################################################################################
663
-
664
-
665
- #######################################################################################################################
666
- #
667
- # Backup code
668
-
669
- #
670
- # End of backup code
671
- #######################################################################################################################
672
-
673
-
674
- #######################################################################################################################
675
- #
676
- # Sanitization/Verification Functions
677
-
678
- # Helper function to validate URL format
679
def is_valid_url(url: str) -> bool:
    """Return True when ``url`` looks like an http(s)/ftp(s) URL.

    The check is purely syntactic (case-insensitive regex); nothing is
    resolved or fetched.
    """
    pattern_parts = (
        r'^(?:http|ftp)s?://',  # scheme: http, https, ftp or ftps
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # domain...
        r'localhost|',  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|',  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)',  # ...or ipv6
        r'(?::\d+)?',  # optional port
        r'(?:/?|[/?]\S+)$',  # optional path / query
    )
    url_pattern = re.compile(''.join(pattern_parts), re.IGNORECASE)
    return url_pattern.match(url) is not None
689
-
690
-
691
def verify_checksum(file_path, expected_checksum):
    """Return True when the file's SHA-256 digest equals ``expected_checksum``.

    Args:
        file_path: Path of the file to hash (read in chunks).
        expected_checksum: Expected SHA-256 hex digest. Surrounding whitespace
            and letter case are ignored, because ``hexdigest()`` is always
            lower-case while published checksums are often upper-case.

    Returns:
        bool: Whether the digests match.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest() == str(expected_checksum).strip().lower()
697
-
698
-
699
def normalize_title(title, preserve_spaces=False):
    """Reduce a title to ASCII characters that are safe in file names.

    Steps: NFKD-normalize and drop every non-ASCII character; replace each
    character that is not a word character, '-' or '.' (or whitespace, when
    ``preserve_spaces`` is true) with '_'; collapse runs of '_'; strip
    leading and trailing '_'.
    """
    ascii_title = (
        unicodedata.normalize('NFKD', title)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    # Characters outside the allowed set become underscores; whitespace is
    # part of the allowed set only when spaces are to be preserved.
    disallowed = r'[^\w\s\-.]' if preserve_spaces else r'[^\w\-.]'
    cleaned = re.sub(disallowed, '_', ascii_title)

    # Collapse runs of underscores. The path/reserved characters
    # / \ : " * ? < > | need no further handling: none of them is in the
    # allowed set, so the substitution above already replaced them all.
    cleaned = re.sub(r'_+', '_', cleaned)

    return cleaned.strip('_')
719
-
720
-
721
-
722
def clean_youtube_url(url):
    """Return ``url`` with the ``list`` query parameter removed.

    The query string is parsed and re-encoded, so every other parameter is
    kept (parameters with blank values are dropped by ``parse_qs``).
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    params.pop('list', None)
    rebuilt_query = urlencode(params, doseq=True)
    return urlunparse(parts._replace(query=rebuilt_query))
730
-
731
def sanitize_filename(filename):
    """Remove the characters ``< > : " / \\ | ? *`` and tidy up whitespace.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is stripped; spaces are kept, not turned into underscores.
    """
    without_reserved = re.sub(r'[<>:"/\\|?*]', '', filename)
    return re.sub(r'\s+', ' ', without_reserved).strip()
736
-
737
-
738
def format_transcription(content):
    """Format raw transcription text for HTML display.

    Literal ``\\n`` sequences are turned into line breaks, sentences glued
    together without a space ("end.Next") are separated, whitespace between
    sentences is normalized to one space, and lines are joined with ``<br>``.

    A space is only inserted after a period that is directly followed by a
    character other than whitespace, a digit or another period, so decimals
    ("3.14") and ellipses ("...") are left intact.

    Args:
        content (str): The raw transcription.

    Returns:
        str: The formatted text with ``<br>`` between lines.
    """
    # Replace '\n' with actual line breaks
    content = content.replace('\\n', '\n')

    formatted_lines = []
    for line in content.split('\n'):
        # Separate sentences that were glued together without a space.
        line = re.sub(r'\.(?=[^\s\d.])', '. ', line)

        # Split into sentences after ., ! or ? followed by spaces
        sentences = re.split(r'(?<=[.!?]) +', line)

        # Trim each sentence, drop empties, and rejoin with single spaces
        trimmed = [sentence.strip() for sentence in sentences if sentence.strip()]
        formatted_lines.append(' '.join(trimmed))

    # Join the lines with HTML line breaks
    return '<br>'.join(formatted_lines)
761
-
762
def sanitize_user_input(message):
    """
    Break up '{{' and '}}' so user text cannot inject template placeholders.

    A space is inserted after every brace that is directly followed by the
    same brace. Unlike a plain replacement of '{{' -> '{ {', this also handles
    runs of odd length: '{{{' becomes '{ { {' instead of '{ {{', which would
    still contain a placeholder opener.

    Args:
        message (str): The user's message.

    Returns:
        str: Sanitized message containing neither '{{' nor '}}'.
    """
    message = re.sub(r'\{(?=\{)', '{ ', message)
    message = re.sub(r'\}(?=\})', '} ', message)
    return message
776
-
777
def format_file_path(file_path, fallback_path=None):
    """Return the first of ``file_path`` / ``fallback_path`` that exists.

    Returns:
        ``file_path`` if it is non-empty and exists, else ``fallback_path`` if
        it is non-empty and exists, else ``None``.
    """
    if file_path and os.path.exists(file_path):
        logging.debug(f"File exists: {file_path}")
        return file_path

    if fallback_path and os.path.exists(fallback_path):
        logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
        return fallback_path

    logging.debug(f"File does not exist: {file_path}. No fallback path available.")
    return None
787
-
788
- #
789
- # End of Sanitization/Verification Functions
790
- #######################################################################################################################
791
-
792
-
793
- #######################################################################################################################
794
- #
795
- # DB Config Loading
796
-
797
-
798
def get_db_config():
    """Read the ``[Database]`` section of ``Config_Files/config.txt``.

    The config file is looked up two directory levels above this module's
    directory. Every key has a fallback, so a missing file or section yields
    defaults instead of raising ``KeyError``.

    Returns:
        dict: ``type``, ``sqlite_path``, ``elasticsearch_host`` and
        ``elasticsearch_port`` (int).
    """
    # Get the directory of the current script
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Go up two levels to the project root directory (tldw)
    project_root = os.path.dirname(os.path.dirname(current_dir))
    # Construct the path to the config file
    config_path = os.path.join(project_root, 'Config_Files', 'config.txt')
    # Read the config file (ConfigParser.read silently skips missing files)
    config = configparser.ConfigParser()
    config.read(config_path)
    # Return the database configuration
    return {
        # 'sqlite' matches the default used by load_and_log_configs().
        'type': config.get('Database', 'type', fallback='sqlite'),
        'sqlite_path': config.get('Database', 'sqlite_path', fallback='./Databases/media_summary.db'),
        'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
        'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
    }
815
-
816
-
817
-
818
-
819
- #
820
- # End of DB Config Loading
821
- #######################################################################################################################
822
-
823
def format_text_with_line_breaks(text):
    """Replace the space after '.', '?' and '!' with an HTML ``<br>`` tag.

    Args:
        text (str): Text to reformat.

    Returns:
        str: The text with one ``<br>`` after each sentence terminator that
        was followed by a space.
    """
    broken = text
    for terminator in ('.', '?', '!'):
        broken = broken.replace(terminator + ' ', terminator + '<br>')
    return broken
827
-
828
- #######################################################################################################################
829
- #
830
- # File Handling Functions
831
-
832
- # Track temp files for cleanup
833
temp_files = []
temp_file_paths = []


def save_temp_file(file):
    """Copy an uploaded file object into the system temp directory.

    Only the final path component of ``file.name`` is used, so an absolute
    path or embedded directories cannot redirect the write outside the temp
    directory (``os.path.join`` discards the temp dir when given an absolute
    second argument).

    Args:
        file: Object exposing ``name`` and a ``read()`` that returns bytes.

    Returns:
        str: Path of the copy. It is also appended to ``temp_files`` so
        ``cleanup_temp_files()`` can remove it later.
    """
    temp_dir = tempfile.gettempdir()
    temp_path = os.path.join(temp_dir, os.path.basename(file.name))
    with open(temp_path, 'wb') as f:
        f.write(file.read())
    temp_files.append(temp_path)
    return temp_path
844
-
845
def cleanup_temp_files():
    """Delete every tracked temporary file that still exists, then reset the list.

    Removal failures are logged and do not stop the remaining files from
    being processed.
    """
    for tracked_path in temp_files:
        if not os.path.exists(tracked_path):
            continue
        try:
            os.remove(tracked_path)
            logging.info(f"Removed temporary file: {tracked_path}")
        except Exception as e:
            logging.error(f"Failed to remove temporary file {tracked_path}: {e}")
    # Emptied in place so other references to the list see the reset.
    temp_files.clear()
855
-
856
def generate_unique_id():
    """Return a new identifier of the form ``uploaded_file_<uuid4>``."""
    random_part = uuid.uuid4()
    return "uploaded_file_" + str(random_part)
858
-
859
- #
860
- # End of File Handling Functions
861
- #######################################################################################################################
 
1
+ # Utils.py
2
+ #########################################
3
+ # General Utilities Library
4
+ # This library is used to hold random utilities used by various other libraries.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
+ # 3. verify_checksum(file_path, expected_checksum)
13
+ # 4. create_download_directory(title)
14
+ # 5. sanitize_filename(filename)
15
+ # 6. normalize_title(title)
16
+ # 7.
17
+ #
18
+ ####################
19
+ #
20
+ # Import necessary libraries
21
+ import chardet
22
+ import configparser
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import os
27
+ import re
28
+ import tempfile
29
+ import time
30
+ import uuid
31
+ from datetime import timedelta
32
+ from typing import Union, AnyStr
33
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
34
+ #
35
+ # Non-Local Imports
36
+ import requests
37
+ import unicodedata
38
+ from tqdm import tqdm
39
+ #
40
+ #######################################################################################################################
41
+ #
42
+ # Function Definitions
43
+
44
def extract_text_from_segments(segments, include_timestamps=True):
    """Pull the transcript text out of a (possibly nested) segments structure.

    Args:
        segments: A dict, or a list of dicts/lists, whose dicts carry a
            ``'Text'`` entry and optionally ``'Time_Start'`` / ``'Time_End'``.
        include_timestamps (bool): When true, segments that have both time
            keys are rendered as ``"<start>s - <end>s | <text>"``.

    Returns:
        str: The extracted text with one segment per line, or the literal
        ``"Error: Unable to extract transcription"`` when nothing was found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _walk(node, with_times):
        # Lists: render each element and keep the non-empty results.
        if isinstance(node, list):
            rendered = [_walk(child, with_times) for child in node]
            return '\n'.join(piece for piece in rendered if piece)
        if not isinstance(node, dict):
            return None

        if with_times and 'Time_Start' in node and 'Time_End' in node:
            caption = node.get('Text', '')
            return f"{node['Time_Start']:.2f}s - {node['Time_End']:.2f}s | {caption}"

        # Otherwise take the first 'Text' value, or the first non-empty
        # result found inside a nested container, in key order.
        for field, payload in node.items():
            if field == 'Text':
                return payload
            if isinstance(payload, (dict, list)):
                nested = _walk(payload, with_times)
                if nested:
                    return nested
        return None

    extracted = _walk(segments, include_timestamps)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
71
+
72
+ #
73
+ #
74
+ #######################
75
+ # Temp file cleanup
76
+ #
77
+ # Global list to keep track of downloaded files
78
downloaded_files = []


def cleanup_downloads():
    """Remove each path listed in ``downloaded_files`` that still exists on disk.

    Progress and failures are reported with ``print``; an error on one file
    does not prevent the others from being processed. The list itself is
    left unchanged.
    """
    for tracked_path in downloaded_files:
        try:
            if not os.path.exists(tracked_path):
                continue
            os.remove(tracked_path)
            print(f"Cleaned up file: {tracked_path}")
        except Exception as e:
            print(f"Error cleaning up file {tracked_path}: {e}")
89
+
90
+ #
91
+ #
92
+ #######################################################################################################################
93
+
94
+
95
+ #######################################################################################################################
96
+ # Config loading
97
+ #
98
def load_comprehensive_config():
    """Locate and parse ``Config_Files/config.txt`` under the project root.

    The project root is taken to be two directory levels above the directory
    that contains this module.

    Returns:
        configparser.ConfigParser: The parsed configuration.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    # Get the directory of the current script (Utils.py)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    logging.debug(f"Current directory: {current_dir}")

    # Go up two levels to the project root directory (tldw)
    project_root = os.path.dirname(os.path.dirname(current_dir))
    logging.debug(f"Project root directory: {project_root}")

    # Construct the path to the config file
    config_path = os.path.join(project_root, 'Config_Files', 'config.txt')
    logging.debug(f"Config file path: {config_path}")

    # Check if the config file exists
    if not os.path.exists(config_path):
        logging.error(f"Config file not found at {config_path}")
        raise FileNotFoundError(f"Config file not found at {config_path}")

    # Read the config file
    config = configparser.ConfigParser()
    config.read(config_path)

    # Log the sections found in the config file (f-string so the section
    # names are interpolated rather than logged as a literal placeholder).
    logging.debug(f"load_comprehensive_config(): Sections found in config: {config.sections()}")

    return config
124
+
125
+
126
def get_project_root():
    """Return the absolute path of the directory two levels above this module's directory."""
    location = os.path.abspath(__file__)
    # file -> containing directory -> parent -> grandparent
    for _ in range(3):
        location = os.path.dirname(location)
    logging.debug(f"Project root: {location}")
    return location
132
+
133
+
134
def get_database_dir():
    """Return the absolute path of the ``Databases`` directory, creating it if needed."""
    databases_root = os.path.join(get_project_root(), 'Databases')
    # exist_ok makes repeated calls harmless.
    os.makedirs(databases_root, exist_ok=True)
    logging.debug(f"Database directory: {databases_root}")
    return databases_root
140
+
141
+
142
def get_database_path(db_name: str) -> str:
    """
    Get the full absolute path for a database file.
    Ensures the path is always within the Databases directory.

    Args:
        db_name: File name (any directory components are discarded).

    Returns:
        Path of the file inside the Databases directory.

    Raises:
        ValueError: If no usable file name remains after stripping
            directories (empty, ``.`` or ``..``).
    """
    # Remove any directory traversal attempts
    safe_db_name = os.path.basename(db_name)
    # basename("..") is still ".." and basename("dir/") is "", both of which
    # would resolve to a directory rather than a file inside Databases.
    if safe_db_name in ('', os.curdir, os.pardir):
        raise ValueError(f"Invalid database name: {db_name!r}")
    path = os.path.join(get_database_dir(), safe_db_name)
    logging.debug(f"Database path for {safe_db_name}: {path}")
    return path
152
+
153
+
154
def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
    """Join ``relative_path`` (converted with ``str``) onto the project root and return the result."""
    anchored = os.path.join(get_project_root(), str(relative_path))
    logging.debug(f"Project relative path for {relative_path}: {anchored}")
    return anchored
159
+
160
def get_chromadb_path():
    """Return ``<project root>/Databases/chroma_db`` without creating it."""
    chroma_location = os.path.join(get_project_root(), 'Databases', 'chroma_db')
    logging.debug(f"ChromaDB path: {chroma_location}")
    return chroma_location
164
+
165
def ensure_directory_exists(path):
    """Create ``path`` and any missing parent directories; do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)
168
+
169
+ # FIXME - update to include prompt path in return statement
170
def load_and_log_configs():
    """Load the project configuration and return it as one nested dictionary.

    Every value is read with a fallback, so individual missing keys or
    sections do not fail the load. API keys are written to the debug log
    only as a masked preview, and an unset key is logged as ``None``.

    Returns:
        dict | None: Dictionary with the keys ``api_keys``, ``models``,
        ``local_api_ip``, ``output_path``, ``processing_choice``,
        ``db_config``, ``embedding_config``, ``auto-save`` and
        ``default_api``; ``None`` if loading fails (the error is logged).
    """
    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        # API Keys
        anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
        logging.debug(f"Loaded Anthropic API Key: {_mask_api_key(anthropic_api_key)}")

        cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
        logging.debug(f"Loaded Cohere API Key: {_mask_api_key(cohere_api_key)}")

        groq_api_key = config.get('API', 'groq_api_key', fallback=None)
        logging.debug(f"Loaded Groq API Key: {_mask_api_key(groq_api_key)}")

        openai_api_key = config.get('API', 'openai_api_key', fallback=None)
        logging.debug(f"Loaded OpenAI API Key: {_mask_api_key(openai_api_key)}")

        huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
        logging.debug(f"Loaded HuggingFace API Key: {_mask_api_key(huggingface_api_key)}")

        openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
        logging.debug(f"Loaded OpenRouter API Key: {_mask_api_key(openrouter_api_key)}")

        deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
        logging.debug(f"Loaded DeepSeek API Key: {_mask_api_key(deepseek_api_key)}")

        mistral_api_key = config.get('API', 'mistral_api_key', fallback=None)
        logging.debug(f"Loaded Mistral API Key: {_mask_api_key(mistral_api_key)}")

        # Models
        anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
        cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
        groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
        openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
        huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
        openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
        deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
        mistral_model = config.get('API', 'mistral_model', fallback='mistral-large-latest')

        logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
        logging.debug(f"Loaded Cohere Model: {cohere_model}")
        logging.debug(f"Loaded Groq Model: {groq_model}")
        logging.debug(f"Loaded OpenAI Model: {openai_model}")
        logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
        logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
        logging.debug(f"Loaded Deepseek Model: {deepseek_model}")
        logging.debug(f"Loaded Mistral Model: {mistral_model}")

        # Local-Models
        kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

        llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

        ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
        ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

        tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
        tabby_model = config.get('models', 'tabby_model', fallback=None)

        vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
        vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
        vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

        ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
        ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
        ollama_model = config.get('Local-API', 'ollama_model', fallback=None)

        aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')

        custom_openai_api_key = config.get('API', 'custom_openai_api_key', fallback=None)
        custom_openai_api_url = config.get('API', 'custom_openai_url', fallback=None)
        logging.debug(f"Loaded Custom openai-like endpoint API Key: {_mask_api_key(custom_openai_api_key)}")

        logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
        logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
        logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
        logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
        logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

        # Retrieve default API choices from the configuration file
        default_api = config.get('API', 'default_api', fallback='openai')

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Retrieve Embedding model settings from the configuration file
        embedding_model = config.get('Embeddings', 'embedding_model', fallback='')
        logging.debug(f"Embedding model set to: {embedding_model}")
        embedding_provider = config.get('Embeddings', 'embedding_provider', fallback='')
        onnx_model_path = config.get('Embeddings', 'onnx_model_path', fallback="./App_Function_Libraries/onnx_models/text-embedding-3-small.onnx")
        model_dir = config.get('Embeddings', 'model_dir', fallback="./App_Function_Libraries/onnx_models")
        embedding_api_url = config.get('Embeddings', 'embedding_api_url', fallback="http://localhost:8080/v1/embeddings")
        embedding_api_key = config.get('Embeddings', 'embedding_api_key', fallback='')
        # NOTE(review): config.get returns a str when the key is present but
        # the int fallback otherwise, so callers may see either type. Left
        # unchanged to avoid altering what existing callers receive.
        chunk_size = config.get('Embeddings', 'chunk_size', fallback=400)
        overlap = config.get('Embeddings', 'overlap', fallback=200)

        # Prompts - FIXME
        prompt_path = config.get('Prompts', 'prompt_path', fallback='Databases/prompts.db')

        # Auto-Save Values
        save_character_chats = config.get('Auto-Save', 'save_character_chats', fallback='False')
        save_rag_chats = config.get('Auto-Save', 'save_rag_chats', fallback='False')

        return {
            'api_keys': {
                'anthropic': anthropic_api_key,
                'cohere': cohere_api_key,
                'groq': groq_api_key,
                'openai': openai_api_key,
                'huggingface': huggingface_api_key,
                'openrouter': openrouter_api_key,
                'deepseek': deepseek_api_key,
                'mistral': mistral_api_key,
                'kobold': kobold_api_key,
                'llama': llama_api_key,
                'ooba': ooba_api_key,
                'tabby': tabby_api_key,
                'vllm': vllm_api_key,
                'ollama': ollama_api_key,
                'aphrodite': aphrodite_api_key,
                'custom_openai_api_key': custom_openai_api_key
            },
            'models': {
                'anthropic': anthropic_model,
                'cohere': cohere_model,
                'groq': groq_model,
                'openai': openai_model,
                'huggingface': huggingface_model,
                'openrouter': openrouter_model,
                'deepseek': deepseek_model,
                'mistral': mistral_model,
                'vllm': vllm_model,
                'tabby': tabby_model,
                'ollama': ollama_model
            },
            'local_api_ip': {
                'kobold': kobold_api_ip,
                'llama': llama_api_IP,
                'ooba': ooba_api_IP,
                'tabby': tabby_api_IP,
                'vllm': vllm_api_url,
                'ollama': ollama_api_url,
                'aphrodite': aphrodite_api_url,
                'custom_openai_api_ip': custom_openai_api_url
            },
            'output_path': output_path,
            'processing_choice': processing_choice,
            'db_config': {
                'prompt_path': get_project_relative_path(prompt_path),
                'db_type': config.get('Database', 'type', fallback='sqlite'),
                'sqlite_path': get_project_relative_path(config.get('Database', 'sqlite_path', fallback='Databases/media_summary.db')),
                'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
                'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200),
                'chroma_db_path': get_project_relative_path(config.get('Database', 'chroma_db_path', fallback='Databases/chroma.db'))
            },
            'embedding_config': {
                'embedding_provider': embedding_provider,
                'embedding_model': embedding_model,
                'onnx_model_path': onnx_model_path,
                'model_dir': model_dir,
                'embedding_api_url': embedding_api_url,
                'embedding_api_key': embedding_api_key,
                'chunk_size': chunk_size,
                'overlap': overlap
            },
            'auto-save': {
                'save_character_chats': save_character_chats,
                'save_rag_chats': save_rag_chats,
            },
            'default_api': default_api
        }

    except Exception as e:
        logging.error(f"Error loading config: {str(e)}")
        return None


def _mask_api_key(api_key):
    """Return a log-safe preview of an API key: first and last five characters, or 'None' when unset."""
    if not api_key:
        # Slicing None would raise TypeError, which the caller's broad
        # except would turn into a failed config load.
        return "None"
    return f"{api_key[:5]}...{api_key[-5:]}"
366
+
367
# Names of the API endpoints known to this module.
global_api_endpoints = [
    "anthropic",
    "cohere",
    "groq",
    "openai",
    "huggingface",
    "openrouter",
    "deepseek",
    "mistral",
    "custom_openai_api",
    "llama",
    "ooba",
    "kobold",
    "tabby",
    "vllm",
    "ollama",
    "aphrodite",
]

# Setup Default API Endpoint
# The configuration is loaded once, at import time.
loaded_config_data = load_and_log_configs()
default_api_endpoint = "huggingface"
372
+
373
def format_api_name(api):
    """Return the display label for an API endpoint identifier.

    Args:
        api (str): Endpoint identifier such as ``"openai"``.

    Returns:
        str: The label from the built-in table, or ``api.title()`` for
        identifiers that are not in it.
    """
    display_names = dict(
        openai="OpenAI",
        anthropic="Anthropic",
        cohere="Cohere",
        groq="Groq",
        huggingface="HuggingFace",
        openrouter="OpenRouter",
        deepseek="DeepSeek",
        mistral="Mistral",
        custom_openai_api="Custom-OpenAI-API",
        llama="Llama.cpp",
        ooba="Ooba",
        kobold="Kobold",
        tabby="Tabbyapi",
        vllm="VLLM",
        ollama="Ollama",
        aphrodite="Aphrodite",
    )
    title_cased = api.title()
    return display_names.get(api, title_cased)
393
+ print(f"Default API Endpoint: {default_api_endpoint}")
394
+
395
+
396
+
397
+ #
398
+ # End of Config loading
399
+ #######################################################################################################################
400
+
401
+
402
+ #######################################################################################################################
403
+ #
404
+ # Prompt Handling Functions
405
+
406
+
407
+
408
+ #
409
+ # End of Prompt Handling Functions
410
+ ### #############################################################################################################
411
+
412
+ #######################################################################################################################
413
+ #
414
+ # Misc-Functions
415
+
416
+ # Log file
417
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
418
+
419
def format_metadata_as_text(metadata):
    """Render a video-metadata mapping as human-readable ``Key: value`` lines.

    Entries whose value is ``None`` are skipped. Lists are comma-joined, an
    eight-character ``upload_date`` becomes ``YYYY-MM-DD``, numeric
    ``view_count`` / ``like_count`` get thousands separators, and a numeric
    ``duration`` (seconds) becomes ``HH:MM:SS``. Values of an unexpected
    type fall back to ``str(value)`` instead of raising.

    Args:
        metadata (dict): Metadata mapping; may be empty or None.

    Returns:
        str: The formatted block, or ``"No metadata available"``.
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue

        if isinstance(value, list):
            # Join list items with commas
            formatted_value = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # Format date as YYYY-MM-DD; go through str() so an integer
            # such as 20230615 is accepted as well.
            date_text = str(value)
            formatted_value = f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"
        elif key in ('view_count', 'like_count') and isinstance(value, (int, float)):
            # Format large numbers with commas
            formatted_value = f"{value:,}"
        elif key == 'duration' and isinstance(value, (int, float)):
            # Convert seconds to HH:MM:SS format (fractions are truncated)
            hours, remainder = divmod(int(value), 3600)
            minutes, seconds = divmod(remainder, 60)
            formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            formatted_value = str(value)

        # Replace underscores with spaces in the key name
        formatted_key = key.replace('_', ' ').capitalize()
        lines.append(f"{formatted_key}: {formatted_value}")

    return "\n".join(lines).strip()
447
+
448
+ # # Example usage:
449
+ # example_metadata = {
450
+ # 'title': 'Sample Video Title',
451
+ # 'uploader': 'Channel Name',
452
+ # 'upload_date': '20230615',
453
+ # 'view_count': 1000000,
454
+ # 'like_count': 50000,
455
+ # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
456
+ # 'tags': ['tag1', 'tag2', 'tag3'],
457
+ # 'description': 'This is a sample video description.'
458
+ # }
459
+ #
460
+ # print(format_metadata_as_text(example_metadata))
461
+
462
+
463
def convert_to_seconds(time_str):
    """Convert a time value to a whole number of seconds.

    Args:
        time_str: ``"HH:MM:SS"``, ``"MM:SS"`` or ``"SS"`` text (surrounding
            whitespace is ignored), or an int/float already in seconds.
            Empty or falsy values count as zero.

    Returns:
        int: The number of seconds.

    Raises:
        ValueError: If the text has more than three ``:``-separated parts
            or any part is not an integer.
    """
    if not time_str:
        return 0

    # If it's already a number, assume it's in seconds
    if isinstance(time_str, (int, float)):
        return int(time_str)

    time_str = str(time_str).strip()
    if not time_str:
        return 0
    if time_str.isdigit():
        return int(time_str)

    # Parse time string in format HH:MM:SS, MM:SS, or SS
    time_parts = time_str.split(':')
    if len(time_parts) > 3:
        raise ValueError(f"Invalid time format: {time_str}")
    try:
        numbers = [int(part) for part in time_parts]
    except ValueError:
        raise ValueError(f"Invalid time format: {time_str}") from None

    # Left-pad with zeros so the values always line up as (hours, minutes, seconds).
    hours, minutes, seconds = [0] * (3 - len(numbers)) + numbers
    return hours * 3600 + minutes * 60 + seconds
484
+
485
+ #
486
+ # End of Misc-Functions
487
+ #######################################################################################################################
488
+
489
+
490
+ #######################################################################################################################
491
+ #
492
+ # File-saving Function Definitions
493
def save_to_file(video_urls, filename):
    """Write the given URLs to ``filename``, one per line, and print a confirmation.

    Args:
        video_urls: Iterable of URL strings.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as url_file:
        url_file.write('\n'.join(video_urls))
    print(f"Video URLs saved to {filename}")
497
+
498
+
499
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """
    Write transcription segments to a JSON file inside the ``Results`` directory.

    Parameters:
        segments (list): List of transcription segments
        file_name (str): Name of the JSON file to save (default: "transcription_segments.json")

    Returns:
        str: Path to the saved JSON file
    """
    results_dir = "Results"
    # Relative to the current working directory; created on first use.
    os.makedirs(results_dir, exist_ok=True)

    destination = os.path.join(results_dir, file_name)
    with open(destination, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII transcript text readable.
        json.dump(segments, sink, ensure_ascii=False, indent=4)

    return destination
521
+
522
+
523
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
    """Download ``url`` to ``dest_path`` with retries, resume and optional checksum check.

    Data is written to ``dest_path + '.tmp'`` and moved into place only
    after the download (and checksum verification, if requested) succeeds.
    A partial ``.tmp`` file left by an earlier attempt is resumed with an
    HTTP ``Range`` request when the server answers 206 Partial Content;
    otherwise the download restarts from the beginning.

    Args:
        url (str): Source URL.
        dest_path (str): Final destination path.
        expected_checksum (str | None): Expected SHA-256 hex digest.
        max_retries (int): Maximum number of attempts.
        delay (int | float): Seconds to wait between attempts.

    Returns:
        str: ``dest_path`` on success.

    Raises:
        Exception: The error from the final attempt, once retries are used up.
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Check if a partial download exists and get its size
            existing_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
            resume_header = {'Range': f'bytes={existing_size}-'} if existing_size else {}

            with requests.get(url, stream=True, headers=resume_header) as response:
                response.raise_for_status()

                # Only a 206 reply means the server honoured the Range
                # request; a 200 carries the whole file, so appending to
                # the partial data would corrupt it.
                resuming = bool(existing_size) and response.status_code == 206
                initial_pos = existing_size if resuming else 0

                # On a 206 reply content-length covers just the remaining
                # bytes, so add what is already on disk for the bar total.
                total_size = int(response.headers.get('content-length', 0)) + initial_pos

                mode = 'ab' if resuming else 'wb'
                with open(temp_path, mode) as temp_file, tqdm(
                    total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive new chunks
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    # Discard the bad data so the retry starts from scratch.
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # Move the file to the final destination; os.replace also
            # overwrites an existing destination on Windows.
            os.replace(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
568
+
569
def create_download_directory(title):
    """Return ``Results/<normalized title>``, creating the directory when it is missing.

    Args:
        title (str): Video title; normalized with ``normalize_title`` before
            being used as the directory name.

    Returns:
        str: Path of the per-title directory.
    """
    # Remove characters that are illegal in Windows filenames and normalize
    folder_name = normalize_title(title, preserve_spaces=False)
    logging.debug(f"{title} successfully normalized")

    session_path = os.path.join("Results", folder_name)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
        return session_path

    os.makedirs(session_path, exist_ok=True)
    logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
581
+
582
+
583
+ import chardet
584
+ import logging
585
+
586
def safe_read_file(file_path):
    """Read a text file, trying several encodings until one decodes cleanly.

    The encoding reported by ``chardet`` (if any) is tried first, followed by
    a fixed list of common encodings. A decoding is accepted when more than
    95% of its characters are printable.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The decoded content; ``""`` for an empty file; otherwise a
        human-readable error message (file not found, read error, or unable
        to decode). Errors are returned as text rather than raised.
    """
    encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252', 'utf-8-sig']

    logging.info(f"Attempting to read file: {file_path}")

    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return f"File not found: {file_path}"
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {e}")
        return f"An error occurred while reading the file: {e}"

    if not raw_data:
        logging.warning(f"File is empty: {file_path}")
        return ""

    # Use chardet to detect the encoding
    detected = chardet.detect(raw_data)
    if detected['encoding'] is not None:
        encodings.insert(0, detected['encoding'])
        logging.info(f"Detected encoding: {detected['encoding']}")

    for encoding in encodings:
        try:
            decoded_content = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected name is not a codec Python knows.
            logging.debug(f"Failed to decode with {encoding}")
            continue

        if not decoded_content:
            # e.g. a BOM-only file decoded as utf-8-sig; avoids dividing by zero.
            logging.info(f"Successfully decoded file with encoding: {encoding}")
            return decoded_content

        # Check if the content is mostly printable
        if sum(c.isprintable() for c in decoded_content) / len(decoded_content) > 0.95:
            logging.info(f"Successfully decoded file with encoding: {encoding}")
            return decoded_content

    # If all decoding attempts fail, return the error message
    logging.error(f"Unable to decode the file {file_path}")
    return f"Unable to decode the file {file_path}"
625
+
626
+
627
+ #
628
+ # End of Files-saving Function Definitions
629
+ #######################################################################################################################
630
+
631
+
632
+ #######################################################################################################################
633
+ #
634
+ # UUID-Functions
635
+
636
def generate_unique_filename(base_path, base_filename):
    """Return a filename that does not yet exist in ``base_path``.

    ``base_filename`` is returned unchanged when it is free; otherwise
    ``_1``, ``_2``, ... is inserted before the extension until an unused
    name is found.
    """
    stem, suffix = os.path.splitext(base_filename)
    candidate = base_filename
    attempt = 0
    while os.path.exists(os.path.join(base_path, candidate)):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate
645
+
646
+
647
def generate_unique_identifier(file_path):
    """Build an identifier ``local:<unix time>:<md5 prefix>:<file name>`` for a file.

    Args:
        file_path (str): Path of an existing, readable file.

    Returns:
        str: The identifier; the hash part is the first eight hex characters
        of the MD5 digest of the file's content.
    """
    base_name = os.path.basename(file_path)
    created_at = int(time.time())

    # Generate a hash of the file content
    with open(file_path, 'rb') as handle:
        digest = hashlib.md5(handle.read()).hexdigest()

    return f"local:{created_at}:{digest[:8]}:{base_name}"
659
+
660
+ #
661
+ # End of UUID-Functions
662
+ #######################################################################################################################
663
+
664
+
665
+ #######################################################################################################################
666
+ #
667
+ # Backup code
668
+
669
+ #
670
+ # End of backup code
671
+ #######################################################################################################################
672
+
673
+
674
+ #######################################################################################################################
675
+ #
676
+ # Sanitization/Verification Functions
677
+
678
+ # Helper function to validate URL format
679
def is_valid_url(url: str) -> bool:
    """Return True when ``url`` matches the http(s)/ftp(s) URL pattern below.

    The host may be a domain name, ``localhost``, an IPv4 address or an IPv6
    address, optionally followed by a port and a path or query.
    """
    pattern_parts = (
        r'^(?:http|ftp)s?://',  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # domain...
        r'localhost|',  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|',  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)',  # ...or ipv6
        r'(?::\d+)?',  # optional port
        r'(?:/?|[/?]\S+)$',
    )
    url_pattern = re.compile(''.join(pattern_parts), re.IGNORECASE)
    return url_pattern.match(url) is not None
689
+
690
+
691
def verify_checksum(file_path, expected_checksum):
    """Check a file's SHA-256 digest against an expected hex digest.

    The comparison ignores letter case and surrounding whitespace in
    ``expected_checksum``, so digests copied from upper-case listings or
    with a trailing newline still match.

    Args:
        file_path (str): File to hash.
        expected_checksum (str): Expected SHA-256 hex digest.

    Returns:
        bool: True when the digests match. A non-string expected value
        never matches.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Hash in fixed-size blocks so large files are not loaded at once.
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)

    if not isinstance(expected_checksum, str):
        return False
    # hexdigest() is lower-case; normalise the expected value to match.
    return sha256_hash.hexdigest() == expected_checksum.strip().lower()
697
+
698
+
699
def normalize_title(title, preserve_spaces=False):
    """Reduce a title to ASCII characters that are safe in file and directory names.

    Args:
        title (str): Title to normalize.
        preserve_spaces (bool): Keep whitespace when true; otherwise
            whitespace is replaced by underscores like other disallowed
            characters.

    Returns:
        str: The title limited to word characters, ``-`` and ``.`` (plus
        whitespace when preserved), with runs of underscores collapsed and
        leading/trailing underscores removed.
    """
    # Decompose accented characters (NFKD) and drop whatever is not ASCII.
    cleaned = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')

    # Replace special characters (and spaces, unless preserved) with underscores.
    disallowed = r'[^\w\s\-.]' if preserve_spaces else r'[^\w\-.]'
    cleaned = re.sub(disallowed, '_', cleaned)

    # Replace multiple consecutive underscores with a single underscore
    cleaned = re.sub(r'_+', '_', cleaned)

    # Replace specific characters with underscores
    for reserved in '/\\:"*?<>|':
        cleaned = cleaned.replace(reserved, '_')

    return cleaned.strip('_')
719
+
720
+
721
+
722
def clean_youtube_url(url):
    """Return *url* with any ``list`` (playlist) query parameter removed.

    The query string is parsed and re-encoded, so the remaining parameters
    keep their order but are emitted in normalized (URL-encoded) form.

    Args:
        url (str): URL to clean.

    Returns:
        str: The URL without a ``list`` parameter.
    """
    parts = urlparse(url)
    kept_params = {
        key: values
        for key, values in parse_qs(parts.query).items()
        if key != 'list'
    }
    # doseq=True expands each value list back into repeated key=value pairs.
    rebuilt_query = urlencode(kept_params, doseq=True)
    return urlunparse(parts._replace(query=rebuilt_query))
730
+
731
def sanitize_filename(filename):
    """Strip characters that are not allowed in file names and tidy whitespace.

    The characters ``< > : " / | ? *`` and the backslash are deleted outright
    (not replaced).  Every run of whitespace is then collapsed to a single
    space, and leading/trailing whitespace is trimmed.  Spaces are kept; they
    are not converted to underscores.

    Args:
        filename (str): Raw file name.

    Returns:
        str: The cleaned file name.
    """
    without_invalid = re.sub(r'[<>:"/\\|?*]', '', filename)
    single_spaced = re.sub(r'\s+', ' ', without_invalid)
    return single_spaced.strip()
736
+
737
+
738
def format_transcription(content):
    """Format transcript text for HTML display.

    Escaped newlines (a literal backslash followed by ``n``) are turned into
    real line breaks.  Within each line, a missing space after a
    sentence-ending period is restored and the spacing between sentences is
    normalized to a single space.  Lines are finally joined with ``<br>``.

    A space is inserted after a period only when the period is immediately
    followed by a letter.  Decimals ("3.14"), ellipses ("...") and periods
    followed by closing punctuation are therefore left intact instead of
    being split apart.

    Args:
        content (str): Raw transcript text.

    Returns:
        str: The formatted text; an empty string for empty or ``None`` input.
    """
    if not content:
        return ''

    # Replace '\n' with actual line breaks
    content = content.replace('\\n', '\n')

    formatted_lines = []
    for line in content.split('\n'):
        # [^\W\d_] is "any letter": a word character that is neither a digit
        # nor an underscore.
        line = re.sub(r'\.(?=[^\W\d_])', '. ', line)

        # Split after sentence-ending punctuation; the split consumes every
        # space that follows it, so re-joining with one space normalizes gaps.
        sentences = re.split(r'(?<=[.!?]) +', line)
        trimmed = [sentence.strip() for sentence in sentences if sentence.strip()]
        formatted_lines.append(' '.join(trimmed))

    # Join the lines with HTML line breaks
    return '<br>'.join(formatted_lines)
761
+
762
def sanitize_user_input(message):
    """
    Breaks up '{{' and '}}' sequences to prevent placeholder injection.

    A space is inserted between every pair of adjacent identical braces, so
    the result never contains '{{' or '}}', whatever the length of the brace
    run.  (Replacing whole '{{' pairs is not enough: '{{{' would become
    '{ {{', which still contains a placeholder opener.)

    Args:
        message (str): The user's message.

    Returns:
        str: Sanitized message.
    """
    # The lookahead leaves the second brace unconsumed, so it is examined
    # again as the possible start of the next pair.
    message = re.sub(r'\{(?=\{)', '{ ', message)
    message = re.sub(r'\}(?=\})', '} ', message)
    return message
776
+
777
def format_file_path(file_path, fallback_path=None):
    """Pick the first of two candidate paths that exists on disk.

    Args:
        file_path: Preferred path; may be empty or None.
        fallback_path: Path to use when the preferred one does not exist.

    Returns:
        ``file_path`` if it exists, otherwise ``fallback_path`` if that
        exists, otherwise None.  The outcome is logged at debug level.
    """
    if file_path and os.path.exists(file_path):
        logging.debug(f"File exists: {file_path}")
        return file_path

    if not (fallback_path and os.path.exists(fallback_path)):
        logging.debug(f"File does not exist: {file_path}. No fallback path available.")
        return None

    logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
    return fallback_path
787
+
788
+ #
789
+ # End of Sanitization/Verification Functions
790
+ #######################################################################################################################
791
+
792
+
793
+ #######################################################################################################################
794
+ #
795
+ # DB Config Loading
796
+
797
+
798
def get_db_config():
    """Load the ``[Database]`` settings from ``Config_Files/config.txt``.

    The config file is located relative to this module: two directories above
    the directory containing this file, then ``Config_Files/config.txt``.

    Returns:
        dict: Keys ``type``, ``sqlite_path``, ``elasticsearch_host`` and
        ``elasticsearch_port``.  All but ``type`` fall back to defaults when
        absent from the file.

    Raises:
        KeyError: If the ``Database`` section or its ``type`` option is
            missing.  This is also what happens when the config file itself
            does not exist, because ``ConfigParser.read`` skips unreadable
            files silently.
    """
    # Go up two levels from this module's directory to the project root.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.dirname(os.path.dirname(module_dir))

    parser = configparser.ConfigParser()
    parser.read(os.path.join(root_dir, 'Config_Files', 'config.txt'))

    section = 'Database'
    # 'type' is looked up first and has no fallback: it is the one required option.
    db_type = parser[section]['type']
    sqlite_path = parser.get(section, 'sqlite_path', fallback='./Databases/media_summary.db')
    es_host = parser.get(section, 'elasticsearch_host', fallback='localhost')
    es_port = parser.getint(section, 'elasticsearch_port', fallback=9200)

    return {
        'type': db_type,
        'sqlite_path': sqlite_path,
        'elasticsearch_host': es_host,
        'elasticsearch_port': es_port,
    }
815
+
816
+
817
+
818
+
819
+ #
820
+ # End of DB Config Loading
821
+ #######################################################################################################################
822
+
823
def format_text_with_line_breaks(text):
    """Insert an HTML line break after each sentence.

    Every occurrence of ``.``, ``?`` or ``!`` followed by a single space has
    that space replaced with ``<br>``.

    Args:
        text (str): Text to format.

    Returns:
        str: The text with ``<br>`` after sentence-ending punctuation.
    """
    formatted = text
    for terminator in ('.', '?', '!'):
        formatted = formatted.replace(terminator + ' ', terminator + '<br>')
    return formatted
827
+
828
+ #######################################################################################################################
829
+ #
830
+ # File Handling Functions
831
+
832
# Track temp files for cleanup
temp_files = []
temp_file_paths = []


def save_temp_file(file):
    """Copy an uploaded file object into the system temp directory.

    Args:
        file: A binary file-like object exposing ``name`` and ``read()``.

    Returns:
        str: Path of the copy inside the temp directory.  The path is also
        appended to the module-level ``temp_files`` list for later cleanup.

    Raises:
        OSError: If the destination cannot be written.
    """
    # Keep only the final path component.  ``file.name`` may be an absolute or
    # nested path; os.path.join() would then discard the temp directory
    # (absolute name) or point outside it ("../"), and could even resolve to
    # the upload's own path.
    file_name = os.path.basename(file.name)
    temp_path = os.path.join(tempfile.gettempdir(), file_name)

    # Read before opening the destination: opening with 'wb' truncates, which
    # would destroy the data if source and destination are the same file.
    data = file.read()
    with open(temp_path, 'wb') as f:
        f.write(data)

    temp_files.append(temp_path)
    return temp_path
844
+
845
def cleanup_temp_files():
    """Delete every tracked temporary file, then reset the tracking list.

    Paths that no longer exist are skipped.  A failure to delete one file is
    logged and does not stop the remaining files from being processed.  The
    module-level ``temp_files`` list is emptied in place at the end, whether
    or not every deletion succeeded.
    """
    for tracked_path in temp_files:
        if not os.path.exists(tracked_path):
            continue
        try:
            os.remove(tracked_path)
            logging.info(f"Removed temporary file: {tracked_path}")
        except Exception as e:
            logging.error(f"Failed to remove temporary file {tracked_path}: {e}")
    temp_files.clear()
855
+
856
def generate_unique_id():
    """Return a fresh identifier of the form ``uploaded_file_<uuid4>``.

    Returns:
        str: ``"uploaded_file_"`` followed by a random UUID in its canonical
        36-character hyphenated form.
    """
    random_uuid = uuid.uuid4()
    return "uploaded_file_" + str(random_uuid)
858
+
859
+ #
860
+ # End of File Handling Functions
861
+ #######################################################################################################################