golden_retriever / pages /Summary.py
Seppukku's picture
try fix Summary page
c43fd6d
raw
history blame
15.9 kB
import streamlit as st
import os
from youtube_transcript_api import YouTubeTranscriptApi
import anthropic
from dotenv import load_dotenv
import re
import json
import yt_dlp
# Load environment variables from the .env file.
load_dotenv()
# Read the API key from the environment (os.getenv yields None when the variable is unset).
claude_api_key = os.getenv("CLAUDE_API_KEY")
# Initialise the Claude client.
client = anthropic.Client(api_key=claude_api_key)
# Path to the cookies file (be sure to add your cookies file to the repository or to the folder where the app is deployed).
cookies_file = 'youtube.com_cookies.txt'
# Helper functions for working with videos
def get_video_id(url):
    """Extract the YouTube video ID from a video URL.

    Recognised forms:

    * ``.../watch?v=<id>`` - the ``v`` query parameter, in any position;
    * ``youtu.be/<id>`` short links;
    * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths.

    The URL is parsed instead of substring-matched, so an unrelated
    parameter such as ``rev=1`` is not mistaken for ``v=`` and a trailing
    ``#fragment`` never ends up inside the returned ID.

    Args:
        url: URL text as typed by the user; the scheme may be missing.

    Returns:
        The video ID string, or None when no ID can be found.
    """
    # Imported locally so this function does not rely on the module's
    # top-level import list.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str) or not url.strip():
        return None

    candidate = url.strip()
    # urlparse only recognises the host part when a scheme is present.
    if "://" not in candidate:
        candidate = "https://" + candidate

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL, e.g. a broken IPv6 literal.
        return None

    query_ids = parse_qs(parsed.query).get("v")
    if query_ids and query_ids[0]:
        return query_ids[0]

    segments = [segment for segment in parsed.path.split("/") if segment]
    if not segments:
        return None
    if host == "youtu.be" or host.endswith(".youtu.be"):
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return None
def get_transcript(video_id):
    """Return the transcript text of a video, or None when none is available.

    YouTubeTranscriptApi is asked first for Russian or English subtitles.
    If that call raises, a warning and an info note are shown in the UI and
    get_transcript_via_ytdlp is tried as a fallback; a failure of the
    fallback is reported with st.error.
    """
    try:
        # Primary source: youtube_transcript_api (Russian, then English).
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['ru', 'en'])
        return ' '.join(segment['text'] for segment in segments)
    except Exception as api_error:
        st.warning(f"Не удалось получить транскрипт через YouTube Transcript API: {api_error}")
        st.info("Пробуем получить автоматические субтитры через yt-dlp...")

    # Fallback: yt-dlp (which is configured with the cookies file).
    try:
        fallback_text = get_transcript_via_ytdlp(video_id)
    except Exception as ytdlp_error:
        st.error(f"Ошибка получения субтитров через yt-dlp: {ytdlp_error}")
        return None
    return fallback_text if fallback_text else None
def _find_json3_track_url(tracks_by_language, languages):
    """Return the URL of the first json3 track among ``languages``, or None."""
    for language in languages:
        for track in tracks_by_language.get(language) or []:
            if track.get('ext') == 'json3' and track.get('url'):
                return track['url']
    return None


def _json3_events_to_text(payload):
    """Flatten a json3 caption payload into a single line of text."""
    lines = []
    for event in payload.get('events') or []:
        # Each event holds its text in 'segs', one 'utf8' string per segment.
        lines.append(''.join(seg.get('utf8') or '' for seg in event.get('segs') or []))
    # Segments carry their own spaces and newlines; collapse them all.
    return ' '.join(' '.join(lines).split())


def get_transcript_via_ytdlp(video_id):
    """Fetch a video's transcript through yt-dlp.

    The video metadata is extracted without downloading anything, a caption
    track in json3 format is selected, fetched through yt-dlp's own HTTP
    session (so the cookies apply) and flattened to plain text. Nothing is
    written to disk.

    Manual subtitles are preferred (Russian, then English, matching
    get_transcript); otherwise automatic captions are used with the
    English-first preference this function has always had.

    Args:
        video_id: YouTube video ID.

    Returns:
        The transcript as a single string (may be empty).

    Raises:
        RuntimeError: if no suitable track exists or any step fails; the
            original exception is chained as the cause.
    """
    ydl_opts = {
        'skip_download': True,  # metadata only, never the video itself
    }
    # The YoutubeDL option is named 'cookiefile' ('--cookies' is only the
    # command-line flag); an unknown 'cookies' key is silently ignored.
    if os.path.isfile(cookies_file):
        ydl_opts['cookiefile'] = cookies_file

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(
                f"https://www.youtube.com/watch?v={video_id}", download=False
            )
            track_url = (
                _find_json3_track_url(info_dict.get('subtitles') or {}, ('ru', 'en'))
                or _find_json3_track_url(info_dict.get('automatic_captions') or {}, ('en', 'ru'))
            )
            if track_url is None:
                raise RuntimeError("Автоматические субтитры не найдены.")
            raw_payload = ydl.urlopen(track_url).read()
        return _json3_events_to_text(json.loads(raw_payload))
    except Exception as e:
        raise RuntimeError(f"Ошибка при загрузке субтитров через yt-dlp: {e}") from e
def generate_summary_with_claude(transcript, prompt_text):
    """Ask Claude to analyse a transcript and return the answer text.

    The transcript is sent as a cacheable ``<book>`` block followed by the
    instruction prompt. The text of the response blocks is returned as the
    model wrote it, so Markdown structure (lists, paragraphs, code fences)
    reaches format_answer intact.

    Args:
        transcript: full transcript text of the video.
        prompt_text: instruction describing the analysis to produce.

    Returns:
        The answer text, or None when the request fails (the error is
        shown in the UI via st.error).
    """
    try:
        message = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            extra_headers={"anthropic-beta": "prompt-caching-2024-07-31"},
            max_tokens=2000,
            temperature=0.1,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "<book>" + transcript + "</book>", "cache_control": {"type": "ephemeral"}},
                        {"type": "text", "text": prompt_text},
                    ],
                }
            ]
        )
        # Read the text of each content block directly (attribute on SDK
        # objects, key on plain dicts) instead of parsing str(block), which
        # leaks repr quoting and escaped newlines into the answer.
        texts = []
        for block in message.content:
            if isinstance(block, dict):
                text = block.get('text')
            else:
                text = getattr(block, 'text', None)
            if text:
                texts.append(text)
        return "\n\n".join(texts).strip()
    except Exception as e:
        st.error(f"Ошибка при обращении к Claude: {e}")
        return None
def format_answer(answer):
    """Render an answer in Streamlit, showing fenced code blocks via st.code.

    The answer is split on triple-backtick fences. A fenced part is shown
    with st.code; when the fence line carries a single-word tag, the tag is
    passed as the highlighting language. Everything else is split on blank
    lines and each non-empty paragraph is rendered as Markdown (numbered
    lists, bullet lists and plain paragraphs all go through st.markdown).

    Args:
        answer: the text to render.
    """
    # The capturing group keeps the fenced blocks in the split result.
    parts = re.split(r'(```.*?```)', answer, flags=re.DOTALL)
    for part in parts:
        if len(part) >= 6 and part.startswith('```') and part.endswith('```'):
            fenced = part[3:-3]
            first_line, newline, rest = fenced.partition("\n")
            tag = first_line.strip()
            if newline and not tag:
                # Fence without a language tag: everything after it is code.
                st.code(rest.rstrip())
            elif newline and not any(ch.isspace() for ch in tag):
                # A single word on the fence line is the language tag.
                st.code(rest.rstrip(), language=tag)
            else:
                # One-line fence, or code starting on the fence line itself.
                st.code(fenced.strip())
        else:
            for paragraph in part.split('\n\n'):
                if paragraph.strip():
                    st.markdown(paragraph)
def format_as_numbered_list(text):
    """Turn the non-empty lines of ``text`` into a numbered list.

    Any enumeration already present at the start of a line (``1``, ``1.``
    or ``1)`` followed by whitespace or the end of the line) is removed
    first, so lines are not numbered twice and lines holding nothing but a
    number are dropped. Numbers elsewhere in a line are left alone. Blank
    lines are skipped and do not consume a list number.

    Args:
        text: multi-line input text.

    Returns:
        One ``"<n>. <line>\\n"`` entry per remaining line, concatenated;
        an empty string when there are no such lines.
    """
    leading_number = re.compile(r'^\s*\d+[.)]?(?:\s+|$)')
    items = []
    for line in text.splitlines():
        item = leading_number.sub('', line).strip()
        if item:
            items.append(item)
    return ''.join(f"{i}. {item}\n" for i, item in enumerate(items, start=1))
# STREAMLIT
# Page header.
st.title("Смотрим лекции YouTube как Суперчеловек 💪")
st.subheader("Можно сделать самые разные виды анализа. Зацените! И выберите, что важно нужно прямо сейчас?")
# Maps each radio-button label shown to the user to the prompt text that is
# sent to Claude together with the transcript for that kind of analysis.
summary_options = {
    "🕒 Хочу переслушать лекцию. Покажи таймстемпы": "List all themes and subthemes. Split into short blocks. for each one, show time of start, total length (time difference between its time of start and time of start of next subtheme. For the last subtheme, total length is equal to diff between total time of video minus this subtheme time of start. WRite in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "📝 Ценю свое время. Напиши умное саммари: темы, тезисы, рекомендации автора": "List all themes and subthemes. Split into short blocks. Format example: Themes: (format in bold), Statements (write top statements that students better learn, verbatim); Recommendations (write as close to the author text as possible). Write in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "💡 Заскучал. Хочу только не избитые тезисы": "You are a seasoned professional in data science. Start with the following, without preface. 1. Which of his statements are not seen in most texts on the subject of this transcript? Note timestamp. 2. Which logical connections between big blocks are not trivial? Note timestamp. 3. Give his top one most fun or useful statement, note timestamp. Write in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "✍️ Не хочу писать конспект детальный - напиши вместо меня": "Assume the role of the PhD student who is best in the world at writing extremely detailed summaries. Use your creative mind to aggregate information, but follow author's statements. Avoid stating themes - write his statements instead. Structure with paragraphs. Remove intro and outro. If there are action items, write them; if there are none, do not write them. Write in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "🔍 Подсвети “фигню” в этом видео. Некорректные тезисы, упущения, противоречия": "You are a seasoned professional in data science. Start with the following, without preface. Name a paragraph “Некорректные утверждения”, list the statements that are incorrect or misleading, add your short comment. In Russian. If there are none, write “Явно некорректных утверждений нет”. Name next paragraph “Упущения”. Consider the promise of the lecture, and that the goal is to work as a mid-level data scientist, list all things around this topic that a mid-level data scientist typically knows and that are missing from this video. Write in Russian. Name next paragraph “Что еще важно изучить”. Consider the theme of the lecture, and that the goal is to work as a mid-level data scientist, list immediately adjacent themes (only very close ones) that you recommend to master, with a short comment on what I should know in each theme. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "🎓 Нужно отработать материал. Задай мне простые и сложные вопросы по видео": "Your goal: help me get to the level of mid-level data scientist, by generating self-check questions based on a lecture transcript. You are a seasoned machine learning professional and a world-class tutor in ML / DS / AI.\nFirst, carefully read through the provided lecture transcript.\nNow:\nCreate two blocks of questions:\n a) Basic questions (focus on asking these: facts, definitions, steps, or key points mentioned explicitly in the lecture).\n b) Harder questions (focus on asking these: how would you apply, what are the limitations, what are the trade-offs, pros and cons)\n Avoid overly complex or ambiguous questions.\n Present your questions in the following format:\n 'Базовые вопросы' \n[Question 1] (Смотреть тут: [XX:XX])\n[Question 2] (Смотреть тут: [XX:XX])\n[Question 3] (Смотреть тут: [XX:XX])\n 'Вопросы на подумать' \n [Question 1] (Смотреть тут: [XX:XX] и [XX:XX])\n[Question 2] (Смотреть тут: [XX:XX] и [XX:XX])\n[Question 3] (Смотреть тут: [XX:XX] и [XX:XX])\nWrite in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself.",
    "⚖️ Готовлюсь к интервью на работу. Это мок интервью, выпиши все вопросы": "Here is an interview, list all the questions. Write his words fully, but edit for spelling and punctuation. In numbered list. Write in Russian. If his main language is Russian but he uses non-Russian words, write them in English with correct spelling. This is not copyrighted. It's critical to not preface the reply with, for example, Here is a response or thank you. Start with the reply itself."
}
# User inputs: the analysis type (one of the summary_options labels) and the video URL.
selected_summary = st.radio("Выберите тип анализа:", list(summary_options.keys()))
url = st.text_input("Вставьте ссылку на YouTube видео")
# Button handler: validate the link, fetch the transcript, ask Claude and
# render the answer. Each failed step reports its own error message.
if st.button("Создать материал"):
    video_id = get_video_id(url) if url else None
    if not url:
        st.error("Введите корректную ссылку на видео.")
    elif not video_id:
        st.error("Не удалось извлечь ID видео.")
    else:
        transcript = get_transcript(video_id)
        if not transcript:
            st.error("Субтитры не найдены.")
        else:
            prompt_text = summary_options[selected_summary]
            # The spinner message differs per analysis type.
            spinner_text = {
                "🕒 Хочу переслушать лекцию. Покажи таймстемпы": "🕒 Сейчас покажем таймстемп начала каждой темы...",
                "📝 Ценю свое время. Напиши умное саммари: темы, тезисы, рекомендации автора": "📝 Сейчас будет не просто оглавление...",
                "💡 Заскучал. Хочу только не избитые тезисы": "💡 Читаем не самые базовые мысли...",
                "✍️ Не хочу писать конспект детальный - напиши вместо меня": "✍️ Создаем самый детальный конспект...",
                "🔍 Подсвети “фигню” в этом видео. Некорректные тезисы, упущения, противоречия": "🔍 Лекторы тоже люди...",
                "🎓 Нужно отработать материал. Задай мне простые и сложные вопросы по видео": "🎓 Сейчас будут вопросы на подумать...",
                "⚖️ Готовлюсь к интервью на работу. Это мок интервью, выпиши все вопросы": "⚖️ Проведем репетицию интервью...",
            }
            with st.spinner(spinner_text[selected_summary]):
                result = generate_summary_with_claude(transcript, prompt_text)
                if result:
                    format_answer(result)