diff --git "a/stop_words.ipynb" "b/stop_words.ipynb" --- "a/stop_words.ipynb" +++ "b/stop_words.ipynb" @@ -2,37 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] C:\\Users\\info\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - }, - { - "ename": "IndexError", - "evalue": "string index out of range", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[13], line 52\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;66;03m# Metinleri temizleme işlemi ve çıkarılan kelimeleri toplama\u001b[39;00m\n\u001b[0;32m 51\u001b[0m all_removed_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n\u001b[1;32m---> 52\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtemizlenmis_metin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmetinler\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mclean_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcustom_stop_words\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 53\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mremoved_words\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetinler\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: clean_text(x, custom_stop_words)[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m 54\u001b[0m all_removed_words\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;241m*\u001b[39mdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mremoved_words\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", - "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\pandas\\core\\series.py:4924\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4791\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4796\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4797\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4798\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4799\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4800\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4915\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4916\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m 4917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4918\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4923\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m-> 4924\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", - "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", - "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", - "Cell \u001b[1;32mIn[13], line 52\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;66;03m# Metinleri temizleme işlemi ve çıkarılan kelimeleri toplama\u001b[39;00m\n\u001b[0;32m 51\u001b[0m all_removed_words \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n\u001b[1;32m---> 52\u001b[0m 
df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtemizlenmis_metin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetinler\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mclean_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcustom_stop_words\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[0;32m 53\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mremoved_words\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetinler\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: clean_text(x, custom_stop_words)[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m 54\u001b[0m all_removed_words\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;241m*\u001b[39mdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mremoved_words\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", - "\u001b[1;31mIndexError\u001b[0m: string index out of range" - ] - } - ], + "outputs": [], "source": [ "# Temizleme işlemini gerçekleştirin\n", "import re\n", @@ -10218,6 +10190,9781 @@ "df_temiz.to_csv('temizlenmis_metin2.csv', index=False)\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TF-Idf ile metinlerdeki anahtar kelimeleri 10'ar keyworde indirdik ve skorlarını aldık " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. 
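+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the top-n idea before the full pipeline below: fit a TfidfVectorizer, then argsort each row of the matrix. The toy documents and top_n=3 are illustrative only, not part of the pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "toy_docs = [\"kedi köpek kedi kuş\", \"köpek balık balık\"]  # illustrative toy documents\n",
+    "vec = TfidfVectorizer()\n",
+    "X = vec.fit_transform(toy_docs)\n",
+    "words = vec.get_feature_names_out()\n",
+    "\n",
+    "top_n = 3\n",
+    "for i, row in enumerate(X):\n",
+    "    scores = row.toarray().flatten()\n",
+    "    top = scores.argsort()[-top_n:][::-1]  # indices of the n largest TF-IDF scores\n",
+    "    print(i + 1, [(words[j], round(scores[j], 3)) for j in top])\n"
+   ]
+  },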
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['bağlantılar', 'leh'] not in stop_words.\n",
+      "  warnings.warn(\n",
+      "10000it [15:12, 10.96it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Results were saved to 'keywords.csv'.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\n# Display the results\\nfor i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\\n    print(f\"Document {i+1}:\")\\n    for keyword, score in zip(keywords, scores):\\n        print(f\"{keyword}: {score:.4f}\")\\n    print(\"\\n\")\\n'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import csv\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from joblib import Parallel, delayed\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "\n",
+    "def extract_keywords_tfidf(metinler, stop_words_list, top_n=10, n_jobs=-1):\n",
+    "    \"\"\"Extract keywords with TF-IDF, using the custom stop-word list, processing rows in parallel.\"\"\"\n",
+    "    vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
+    "\n",
+    "    # Build the TF-IDF matrix over the cleaned texts read from the CSV\n",
+    "    X = vectorizer.fit_transform(metinler)\n",
+    "    feature_names = vectorizer.get_feature_names_out()  # maps each TF-IDF column back to its word\n",
+    "\n",
+    "    # Find the top TF-IDF scores per row, in parallel\n",
+    "    def process_row(row):\n",
+    "        tfidf_scores = row.toarray().flatten()  # densify the sparse row\n",
+    "        top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # indices of the n highest scores\n",
+    "\n",
+    "        # Collect the highest-scoring words and their scores\n",
+    "        top_keywords = [feature_names[i] for i in top_indices]\n",
+    "        top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
+    "        return top_keywords, top_tfidf_scores\n",
+    "\n",
+    "    results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in tqdm(X))\n",
+    "\n",
+    "    # Split the results into separate lists\n",
+    "    top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)\n",
+    "\n",
+    "    return top_keywords_per_document, top_tfidf_scores_per_document\n",
+    "\n",
+    "\n",
+    "df = pd.read_csv(\"temizlenmis_metin.csv\", encoding='utf-8', on_bad_lines='skip')\n",
+    "metinler = df[\"temizlenmis_metin\"].fillna(\"\").tolist()\n",
+    "\n",
+    "with open(\"gereksiz_kelimeler.txt\", \"r\", encoding='utf-8') as f:\n",
+    "    stop_words_list = f.read().splitlines()\n",
+    "\n",
+    "combined_sample = metinler[:10000]\n",
+    "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined_sample, stop_words_list, top_n=10, n_jobs=-1)\n",
+    "# n_jobs=2 would run two workers side by side; n_jobs=-1 uses every available CPU core.\n",
+    "\n",
+    "with open('keywords.csv', mode='w', newline='', encoding='utf-8') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    # Write the header row\n",
+    "    writer.writerow(['Document_Index'] + [f'Keyword_{i+1}' for i in range(10)] + [f'Score_{i+1}' for i in range(10)])\n",
+    "\n",
+    "    # Write the keywords and scores of every document\n",
+    "    for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
+    "        row = [i+1] + keywords + [f\"{score:.4f}\" for score in scores]\n",
+    "        writer.writerow(row)\n",
+    "\n",
+    "print(\"Results were saved to 'keywords.csv'.\")\n",
+    "\"\"\"\n",
+    "# Display the results\n",
+    "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
+    "    print(f\"Document {i+1}:\")\n",
+    "    for keyword, score in zip(keywords, scores):\n",
+    "        print(f\"{keyword}: {score:.4f}\")\n",
+    "    print(\"\\n\")\n",
+    "\"\"\"\n"
+   ]
+  },
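+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The UserWarning above means some entries in gereksiz_kelimeler.txt do not survive the vectorizer's own tokenizer (for example multi-word or apostrophe forms). A small sketch, not part of the original pipeline, that re-tokenizes the list through the same analyzer so both sides agree; the two raw entries are hypothetical examples."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "raw_stop_words = ['dış bağlantılar', \"leh'çe\"]  # hypothetical raw entries\n",
+    "analyzer = TfidfVectorizer().build_analyzer()\n",
+    "\n",
+    "# Run every stop word through the analyzer the vectorizer itself will use\n",
+    "normalized_stop_words = sorted({tok for w in raw_stop_words for tok in analyzer(w)})\n",
+    "print(normalized_stop_words)\n"
+   ]
+  },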
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "\n",
+    "def clean_text(text):\n",
+    "    if pd.isna(text):\n",
+    "        return ''\n",
+    "    return text\n",
+    "\n",
+    "# Read a fixed number of texts from the CSV file\n",
+    "def get_text(file_path='temizlenmis_metin.csv', num_rows=10000):\n",
+    "    df = pd.read_csv(file_path, nrows=num_rows)\n",
+    "    return df['temizlenmis_metin'].tolist()\n",
+    "\n",
+    "# Read the keywords\n",
+    "def get_keywords(file_path='keywords.csv', num_keywords=100000, fill_value='geçersiz'):\n",
+    "    df = pd.read_csv(file_path)\n",
+    "\n",
+    "    keyword_columns = ['Keyword_1', 'Keyword_2', 'Keyword_3', 'Keyword_4', 'Keyword_5']\n",
+    "\n",
+    "    # Detect which keyword columns contain NaN values\n",
+    "    nan_columns = df[keyword_columns].isna().any()\n",
+    "    for col, has_nan in nan_columns.items():\n",
+    "        if has_nan:\n",
+    "            print(f\"Column {col} contains NaN values.\")\n",
+    "\n",
+    "    # Replace NaNs with a sentinel value ('geçersiz' = 'invalid')\n",
+    "    df[keyword_columns] = df[keyword_columns].fillna(fill_value)\n",
+    "\n",
+    "    keywords = df[keyword_columns].values.flatten().tolist()  # flatten the keywords into one list\n",
+    "    # Cap the number of keywords at the requested limit\n",
+    "    keywords = keywords[:num_keywords]\n",
+    "\n",
+    "    return keywords\n",
+    "\n",
+    "texts = get_text(num_rows=10000)\n",
+    "\n",
+    "# Replace non-string (NaN) entries in the texts list with empty strings\n",
+    "texts = [text if isinstance(text, str) else \"\" for text in texts]\n",
+    "print(f\"Total number of texts: {len(texts)}\")\n",
+    "\n",
+    "keywords = get_keywords(num_keywords=100000)\n",
+    "print(f\"Total number of keywords: {len(keywords)}\")\n",
+    "\n",
+    "# Build the TF-IDF vectorizer and encode both the texts and the keywords\n",
+    "tfidf_vectorizer = TfidfVectorizer()\n",
+    "text_tfidf = tfidf_vectorizer.fit_transform(texts)\n",
+    "keyword_tfidf = tfidf_vectorizer.transform(keywords)\n",
+    "\n",
+    "# Similarity computation\n",
+    "similarity_matrix = cosine_similarity(text_tfidf, keyword_tfidf)\n",
+    "\n",
+    "# For each text, pick the keywords with the highest similarity scores\n",
+    "top_n = 5  # we take the first 5 keywords\n",
+    "top_keywords_per_document = []\n",
+    "\n",
+    "for idx, row in enumerate(similarity_matrix):\n",
+    "    sorted_indices = row.argsort()[::-1]  # sort by similarity, descending\n",
+    "    top_keywords = [keywords[i] for i in sorted_indices[:top_n]]\n",
+    "    top_scores = [row[i] for i in sorted_indices[:top_n]]\n",
+    "    top_keywords_per_document.append((idx + 1, list(zip(top_keywords, top_scores))))\n",
+    "\n",
+    "# Print the results\n",
+    "for doc_id, keywords_scores in top_keywords_per_document:\n",
+    "    print(f\"Text {doc_id}:\")\n",
+    "    for keyword, score in keywords_scores:\n",
+    "        print(f\"Keyword: {keyword}, Similarity: {score:.4f}\")\n",
+    "    print(\"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of texts: 10000\n",
+      "Total number of keywords: 50000\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'# Print the results\\nfor doc_id, keywords_scores in top_keywords_per_document:\\n    print(f\"Text {doc_id}:\")\\n    for keyword, score in keywords_scores:\\n        print(f\"Keyword: {keyword}, Similarity: {score:.4f}\")\\n    print(\"\\n\")'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "def clean_text(text):\n",
+    "    return '' if pd.isna(text) else text\n",
+    "\n",
+    "# Read a fixed number of texts from the CSV file\n",
+    "def get_text(file_path='temizlenmis_metin.csv', num_rows=10000):\n",
+    "    df = pd.read_csv(file_path, nrows=num_rows)\n",
+    "    return df['temizlenmis_metin'].tolist()\n",
+    "\n",
+    "# Read the keywords and their scores\n",
+    "def get_keywords_and_scores(file_path='keywords.csv', num_keywords=5000, fill_value='geçersiz'):\n",
+    "    df = pd.read_csv(file_path)\n",
+    "    keyword_columns = ['Keyword_1', 'Keyword_2', 'Keyword_3', 'Keyword_4', 'Keyword_5']\n",
+    "    score_columns = ['Score_1', 'Score_2', 'Score_3', 'Score_4', 'Score_5']\n",
+    "\n",
+    "    # Fill NaNs: keywords get a sentinel string ('geçersiz' = 'invalid'), scores get 0\n",
+    "    df[keyword_columns] = df[keyword_columns].fillna(fill_value)\n",
+    "    df[score_columns] = df[score_columns].fillna(0)\n",
+    "\n",
+    "    # Flatten keywords and scores into single lists\n",
+    "    keywords = df[keyword_columns].values.flatten().tolist()[:num_keywords]\n",
+    "    scores = df[score_columns].values.flatten().tolist()[:num_keywords]\n",
+    "\n",
+    "    return keywords, scores\n",
+    "\n",
+    "texts = get_text(num_rows=10000)\n",
+    "\n",
+    "# Clean NaN entries out of the texts list\n",
+    "texts = [clean_text(text) for text in texts]\n",
+    "print(f\"Total number of texts: {len(texts)}\")\n",
+    "\n",
+    "keywords, scores = get_keywords_and_scores(num_keywords=50000)\n",
+    "print(f\"Total number of keywords: {len(keywords)}\")\n",
+    "\n",
+    "# Build the TF-IDF vectorizer and encode both the texts and the keywords\n",
+    "tfidf_vectorizer = TfidfVectorizer()\n",
+    "text_tfidf = tfidf_vectorizer.fit_transform(texts)\n",
+    "keyword_tfidf = tfidf_vectorizer.transform(keywords)\n",
+    "\n",
+    "# Similarity computation\n",
+    "similarity_matrix = cosine_similarity(text_tfidf, keyword_tfidf)\n",
+    "\n",
+    "# For each text, find the 5 distinct keywords with the highest similarity\n",
+    "top_n = 5  # we take the first 5 keywords\n",
+    "top_keywords_per_document = []\n",
+    "\n",
+    "for idx, row in enumerate(similarity_matrix):\n",
+    "    sorted_indices = row.argsort()[::-1]  # sort by similarity, descending\n",
+    "    selected_keywords = set()  # a set tracks which keywords were already used\n",
+    "    top_keywords = []\n",
+    "    \n",
+    "    for i in sorted_indices:\n",
+    "        keyword = keywords[i]\n",
+    "        score = row[i]\n",
+    "        \n",
+    "        if keyword not in selected_keywords and len(top_keywords) < top_n:\n",
+    "            top_keywords.append((keyword, score))\n",
+    "            selected_keywords.add(keyword)\n",
+    "        \n",
+    "        if len(top_keywords) >= top_n:\n",
+    "            break\n",
+    "\n",
+    "    top_keywords_per_document.append({\n",
+    "        \"title\": f\"Metin {idx + 1}\",  # placeholder title; the real titles are merged in below\n",
+    "        \"text\": texts[idx],\n",
+    "        \"keywords\": top_keywords\n",
+    "    })\n",
+    "\n",
+    "# Convert the results to a DataFrame and write them to CSV\n",
+    "results_df = pd.DataFrame(top_keywords_per_document)\n",
+    "results_df.to_csv('top_keywords_per_document.csv', index=False)\n",
+    "\n",
+    "\"\"\"# Print the results\n",
+    "for doc_id, keywords_scores in top_keywords_per_document:\n",
+    "    print(f\"Text {doc_id}:\")\n",
+    "    for keyword, score in keywords_scores:\n",
+    "        print(f\"Keyword: {keyword}, Similarity: {score:.4f}\")\n",
+    "    print(\"\\n\")\"\"\"\n"
+   ]
+  },
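+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each keyword passes through `transform` as a one-word document of its own, so entry (i, j) of `similarity_matrix` is simply the cosine between text i's TF-IDF vector and the basis direction of keyword j. A toy sketch of that behaviour; the documents and keywords are invented for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "toy_texts = [\"kedi köpek kedi\", \"köpek kuş kuş\"]  # invented documents\n",
+    "toy_keywords = [\"kedi\", \"kuş\"]  # keywords treated as one-word documents\n",
+    "\n",
+    "vec = TfidfVectorizer()\n",
+    "text_m = vec.fit_transform(toy_texts)\n",
+    "kw_m = vec.transform(toy_keywords)\n",
+    "\n",
+    "# Rows are texts, columns are keywords\n",
+    "print(cosine_similarity(text_m, kw_m).round(3))\n"
+   ]
+  },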
"collection = db['test'] # Güncellemek istediğiniz koleksiyon adı\n", + "\n", + "# MongoDB'den metinleri alma\n", + "mongo_texts = collection.find({}, {'title': 1, 'text': 1, '_id': 0})\n", + "mongo_texts_dict = {doc['title']: doc['text'] for doc in mongo_texts}\n", + "\n", + "# CSV dosyasını okuma\n", + "results_df = pd.read_csv('top_keywords_per_document.csv')\n", + "\n", + "# Başlıkları ve metinleri eşleştirme\n", + "# Başlıkları eşleştirme\n", + "title_mapping = {f\"Metin {i+1}\": title for i, title in enumerate(mongo_texts_dict.keys())}\n", + "\n", + "# Metinleri eşleştirme\n", + "text_mapping = {title: mongo_texts_dict[title] for title in title_mapping.values()}\n", + "\n", + "# Başlıkları ve metinleri CSV'ye ekleme\n", + "results_df['title'] = results_df['title'].map(title_mapping).fillna(results_df['title'])\n", + "results_df['text'] = results_df['text'].map(text_mapping).fillna(results_df['text'])\n", + "\n", + "# Güncellenmiş CSV dosyasını kaydetme\n", + "results_df.to_csv('top_keywords_per_document_updated.csv', index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\huggingface_hub\\file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n", + " _torch_pytree._register_pytree_node(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'sequence': 'Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU', 'labels': ['politics', 'economy', 'environment', 'entertainment'], 'scores': [0.9658799767494202, 0.022846756502985954, 0.0073339128866791725, 0.003939332906156778]}\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from transformers import AutoModel, AutoTokenizer,pipeline\n", + "import torch\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "\n", + "classifier = pipeline('zero-shot-classification', model='MoritzLaurer/mDeBERTa-v3-base-mnli-xnli')\n", + "\n", + "# Load model directly\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli\")\n", + "model = AutoModelForSequenceClassification.from_pretrained(\"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli\")\n", + "\n", + "sequence_to_classify = \"Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU\"\n", + "candidate_labels = [\"politics\", \"economy\", \"entertainment\", \"environment\"]\n", + "output = classifier(sequence_to_classify, candidate_labels, multi_label=False)\n", + "print(output)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cee197a844ac4964931f69b9827defef", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/26.0 [00:00=0.20.1`: Please run `pip install 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "# Note: Trainer needs accelerate>=0.20.1; the first run of this cell failed with an\n",
+    "# ImportError until `pip install accelerate -U` (or `pip install transformers[torch]`) was applied.\n",
+    "\n",
+    "# Load the tokenizer and model\n",
+    "model_name = 'gpt2'\n",
+    "model = GPT2LMHeadModel.from_pretrained(model_name)\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
+    "tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default\n",
+    "\n",
+    "# Load the training data\n",
+    "df = pd.read_csv(\"top_keywords_per_document_updated.csv\")\n",
+    "\n",
+    "# Custom Dataset class\n",
+    "class CustomDataset(Dataset):\n",
+    "    def __init__(self, dataframe):\n",
+    "        self.dataframe = dataframe\n",
+    "        self.texts = dataframe['text'].tolist()\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.dataframe)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        text = self.texts[idx]\n",
+    "        tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors=\"pt\")\n",
+    "        input_ids = tokenized['input_ids'].squeeze()\n",
+    "        # For causal-LM fine-tuning the labels are the input ids themselves;\n",
+    "        # without them Trainer cannot compute a loss.\n",
+    "        return {'input_ids': input_ids,\n",
+    "                'attention_mask': tokenized['attention_mask'].squeeze(),\n",
+    "                'labels': input_ids.clone()}\n",
+    "\n",
+    "# Build the dataset (Trainer does its own batching, so no DataLoader is needed)\n",
+    "dataset = CustomDataset(df)\n",
+    "\n",
+    "# Training parameters\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir='./results',  # output directory\n",
+    "    per_device_train_batch_size=4,\n",
+    "    per_device_eval_batch_size=4,\n",
+    "    num_train_epochs=3,\n",
+    "    logging_dir='./logs',\n",
+    ")\n",
+    "\n",
+    "# Create the Trainer and start training\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=dataset,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "\n",
+    "def generate_custom_subheadings(title, keywords, model, tokenizer, num_subheadings=5):\n",
+    "    prompt = (f\"Title: {title}\\nKeywords: {', '.join(keywords)}\\n\"\n",
+    "              f\"Please produce {num_subheadings} subheading suggestions that fit the title and keywords, \"\n",
+    "              f\"contain at least three of the keywords, and share a word with the title.\")\n",
+    "\n",
+    "    inputs = tokenizer(prompt, return_tensors='pt')\n",
+    "    # Sampling is required to get several distinct sequences from a single prompt,\n",
+    "    # and max_new_tokens keeps the budget independent of the prompt length\n",
+    "    outputs = model.generate(inputs['input_ids'], max_new_tokens=50, num_return_sequences=num_subheadings,\n",
+    "                             no_repeat_ngram_size=2, do_sample=True, pad_token_id=tokenizer.eos_token_id)\n",
+    "    subheadings = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n",
+    "    return subheadings\n",
+    "\n",
+    "# Test it\n",
+    "title = \"Örnek Başlık\"  # sample value for the title\n",
+    "keywords = [\"anahtar1\", \"anahtar2\", \"anahtar3\"]  # sample values for the keywords\n",
+    "subheadings = generate_custom_subheadings(title, keywords, model, tokenizer)\n",
+    "for i, subheading in enumerate(subheadings):\n",
+    "    print(f\"Subheading {i+1}: {subheading}\")\n"
+   ]
+  },
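+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A sketch of an alternative to building labels inside CustomDataset: transformers' DataCollatorForLanguageModeling with mlm=False produces the causal-LM labels at batch time and masks the padding positions out of the loss. It assumes the tokenizer, model, training_args and dataset from the cell above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import DataCollatorForLanguageModeling, Trainer\n",
+    "\n",
+    "# mlm=False means causal LM: the collator copies input_ids into labels and\n",
+    "# sets padded positions to -100 so the loss ignores them\n",
+    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=dataset,\n",
+    "    data_collator=data_collator,\n",
+    ")\n"
+   ]
+  },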
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_embeddings(text):\n",
+    "    # Assumes `model` and `tokenizer` from an earlier cell; `model` must be a plain\n",
+    "    # encoder whose output exposes last_hidden_state (e.g. AutoModel), not an LM head.\n",
+    "    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(**inputs)\n",
+    "    # Mean-pool the token states to get one vector per text\n",
+    "    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()\n",
+    "    return embeddings\n",
+    "\n",
+    "def generate_subheadings(title, keywords, top_n=5):\n",
+    "    title_embedding = get_embeddings(title)\n",
+    "    keyword_embeddings = [get_embeddings(keyword) for keyword in keywords]\n",
+    "\n",
+    "    similarities = [cosine_similarity(title_embedding.unsqueeze(0), keyword_embedding.unsqueeze(0))[0][0] for keyword_embedding in keyword_embeddings]\n",
+    "    sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)\n",
+    "\n",
+    "    # Build subheadings from the keywords most similar to the title\n",
+    "    subheadings = [keywords[i] for i in sorted_indices[:top_n]]\n",
+    "    return subheadings\n",
+    "\n",
+    "# Sample data\n",
+    "title = \"Veri Bilimi Nedir?\"\n",
+    "keywords = [\"Makine Öğrenmesi\", \"Büyük Veri\", \"Yapay Zeka\", \"1936\", \"Veri Madenciliği\", \"Derin Öğrenme\", \"Doğal Dil İşleme\", \"1982\", \"Yapay Sinir Ağları\", \"Kümeleme\"]\n",
+    "\n",
+    "# Generate the subheadings\n",
+    "subheadings = generate_subheadings(title, keywords)\n",
+    "print(\"Subheadings:\")\n",
+    "for subheading in subheadings:\n",
+    "    print(f\"- {subheading}\")\n"
+   ]
+  },
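+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The traceback in the next cell comes from a DistilBERT forward pass, but that cell's source is cut off in this diff. A self-contained sketch of the same mean-pooling idea with an explicit encoder; the checkpoint name is an assumption for illustration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoTokenizer, AutoModel\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "# Checkpoint is an assumption; the diff only shows that the failing cell ran DistilBERT\n",
+    "ckpt = 'distilbert-base-multilingual-cased'\n",
+    "emb_tokenizer = AutoTokenizer.from_pretrained(ckpt)\n",
+    "emb_model = AutoModel.from_pretrained(ckpt)\n",
+    "\n",
+    "def embed(text, max_length=512):\n",
+    "    # Truncate so the sequence never exceeds the model's position embeddings\n",
+    "    inputs = emb_tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)\n",
+    "    with torch.no_grad():\n",
+    "        out = emb_model(**inputs)\n",
+    "    return out.last_hidden_state.mean(dim=1)  # shape (1, hidden_size)\n",
+    "\n",
+    "title_vec = embed(\"Veri Bilimi Nedir?\")\n",
+    "kw_vec = embed(\"Makine Öğrenmesi\")\n",
+    "print(cosine_similarity(title_vec.numpy(), kw_vec.numpy())[0][0])\n"
+   ]
+  },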
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n",
+      "  _torch_pytree._register_pytree_node(\n",
+      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\huggingface_hub\\file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b4d7bab59ed6457ab11999cc4943324b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/273M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "output_type": "error",
+     "traceback": [
+      "Condensed traceback: line 53, subheadings = generate_subheadings(title, keywords) -> get_embeddings(title) -> outputs = model(**inputs) -> DistilBertModel.forward -> Transformer.forward -> TransformerBlock.forward -> FFN.forward (the error message itself is cut off at the end of this diff)"
\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: torch\u001b[38;5;241m.\u001b[39mTensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m--> 464\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mapply_chunking_to_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mff_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_size_feed_forward\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mseq_len_dim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\pytorch_utils.py:242\u001b[0m, in \u001b[0;36mapply_chunking_to_forward\u001b[1;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[0;32m 239\u001b[0m \u001b[38;5;66;03m# concatenate output at same dimension\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcat(output_chunks, dim\u001b[38;5;241m=\u001b[39mchunk_dim)\n\u001b[1;32m--> 242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_tensors\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\models\\distilbert\\modeling_distilbert.py:469\u001b[0m, in \u001b[0;36mFFN.ff_chunk\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 467\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlin1(\u001b[38;5;28minput\u001b[39m)\n\u001b[0;32m 468\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactivation(x)\n\u001b[1;32m--> 469\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlin2\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 470\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(x)\n\u001b[0;32m 471\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n", + "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\nn\\modules\\module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\nn\\modules\\module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic 
in\u001b[39;00m\n\u001b[0;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\nn\\modules\\linear.py:117\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import pandas as pd\n", + "from transformers import AutoModel, AutoTokenizer\n", + "import torch\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "# Model ve tokenizer'ı yükleme\n", + "model_name = 'dbmdz/distilbert-base-turkish-cased'\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModel.from_pretrained(model_name)\n", + "\n", + "def get_embeddings(text, max_length=512):\n", + " \"\"\"\n", + " Metni gömme (embedding) vektörüne dönüştürür.\n", + " \"\"\"\n", + " inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)\n", + " with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " # İlk özniteliklerin ortalamasını alarak vektör elde etme\n", + " embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()\n", + " return embeddings\n", + "\n", + "def generate_subheadings(title, keywords, top_n=5):\n", + " \"\"\"\n", + " Başlık ve anahtar kelimeleri kullanarak alt başlıklar oluşturur.\n", + " \"\"\"\n", + " title_embedding = get_embeddings(title)\n", + " keyword_embeddings = [get_embeddings(keyword) for keyword in keywords]\n", + "\n", + " # Başlık ile anahtar kelimeler arasındaki benzerlikleri hesaplama\n", + " similarities = [cosine_similarity(title_embedding.unsqueeze(0), 
keyword_embedding.unsqueeze(0))[0][0].item() for keyword_embedding in keyword_embeddings]\n", + " sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)\n", + "\n", + " # Benzerliklere göre anahtar kelimelerden alt başlıklar oluşturma\n", + " subheadings = [keywords[i] for i in sorted_indices[:top_n]]\n", + " return subheadings\n", + "\n", + "# CSV dosyasını okuma\n", + "df = pd.read_csv('top_keywords_per_document.csv')\n", + "\n", + "# Başlıkları ve anahtar kelimeleri içeren sütun isimlerini belirleyin\n", + "title_column = 'title' # Başlıkları içeren sütun\n", + "keywords_column = 'keywords' # Anahtar kelimeleri içeren sütun\n", + "\n", + "# Başlık ve anahtar kelimeleri içeren verileri çekme\n", + "titles = df[title_column].dropna().tolist()\n", + "keywords_list = df[keywords_column].dropna().tolist()\n", + "\n", + "# Anahtar kelimeleri virgülle ayırarak listeye dönüştürme\n", + "keywords_list = [keywords.split(',') for keywords in keywords_list]\n", + "\n", + "# Başlık ve anahtar kelimeleri kullanarak alt başlıkları oluşturma\n", + "for title, keywords in zip(titles, keywords_list):\n", + " subheadings = generate_subheadings(title, keywords)\n", + " print(f\"Başlık: {title}\")\n", + " print(\"Alt Başlıklar:\")\n", + " for subheading in subheadings:\n", + " print(f\"- {subheading}\")\n", + " print(\"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\huggingface_hub\\file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\modeling_utils.py:519: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
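+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Editor's sketch (an assumption, not part of the original run): the interrupted cells above embed texts one at a time, i.e. one forward pass per keyword or per combination. Batching the texts through a single forward pass is usually far faster on the same hardware. `get_embeddings_batch` below is a hypothetical helper that reuses the `tokenizer` and `model` loaded above and uses mask-aware mean pooling so padding tokens do not shift the average."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "def get_embeddings_batch(texts, max_length=512, batch_size=32):\n",
+    "    \"\"\"Embed a list of texts in batches; returns a (len(texts), dim) tensor.\"\"\"\n",
+    "    all_embeddings = []\n",
+    "    for start in range(0, len(texts), batch_size):\n",
+    "        batch = texts[start:start + batch_size]\n",
+    "        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_length)\n",
+    "        with torch.no_grad():\n",
+    "            outputs = model(**inputs)\n",
+    "        # Mask-aware mean pooling so padding tokens do not dilute the average\n",
+    "        mask = inputs['attention_mask'].unsqueeze(-1)\n",
+    "        summed = (outputs.last_hidden_state * mask).sum(dim=1)\n",
+    "        counts = mask.sum(dim=1).clamp(min=1)\n",
+    "        all_embeddings.append(summed / counts)\n",
+    "    return torch.cat(all_embeddings, dim=0)\n"
+   ]
+  },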
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\huggingface_hub\\file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\modeling_utils.py:519: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (see https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models). In a future release, the default value for `weights_only` will be flipped to `True`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file.\n",
+      "  return torch.load(checkpoint_file, map_location=map_location)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Başlık: Metin 1\n",
+      "Alt Başlıklar:\n",
+      "- ('engelsin' ('kapital'\n",
+      "- [('marxın' ('kapital'\n",
+      "- [('marxın' ('engelsin'\n",
+      "- ('hegelin' ('kapital'\n",
+      "- [('marxın' ('hegelin'\n",
+      "\n",
+      "\n",
+      "Başlık: Metin 2\n",
+      "Alt Başlıklar:\n",
+      "- ('dostlar' ('pir'\n",
+      "- [('ruhi' ('dostlar'\n",
+      "- [('ruhi' ('pir'\n",
+      "- ('pir' ('sıdıka'\n",
+      "- ('sunun' ('pir'\n",
+      "\n",
+      "\n",
+      "Başlık: Metin 3\n",
+      "Alt Başlıklar:\n",
+      "- [('bilgisayarlar' ('bilgisayarı'\n",
+      "- [('bilgisayarlar' ('destekli'\n",
+      "- [('bilgisayarlar' ('dişli'\n",
+      "- [('bilgisayarlar' ('bilgisayarlı'\n",
+      "- ('destekli' ('dişli'\n",
+      "\n",
+      "\n",
+      "Başlık: Metin 4\n",
+      "Alt Başlıklar:\n",
+      "- [('edebiyatın' ('drama'\n",
+      "- ('edebî' ('drama'\n",
+      "- [('edebiyatın' ('edebî'\n",
+      "- [('edebiyatın' ('edeb'\n",
+      "- ('edeb' ('drama'\n",
+      "\n",
+      "\n",
+      "Başlık: Metin 5\n",
+      "Alt Başlıklar:\n",
+      "- [('mühendisliği' ('malzemelerin'\n",
+      "- [('mühendisliği' ('mekatronik'\n",
+      "- [('mühendisliği' ('dalıdır'\n",
+      "- [('mühendisliği' ('ilgilenir'\n",
+      "- ('mekatronik' ('malzemelerin'\n",
+      "\n",
+      "\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "KeyboardInterrupt                         Traceback (most recent call last)",
+      "Cell In[22], line 61 -> subheadings = generate_subheadings(title, keywords)\n",
+      "Cell In[22], line 34, in generate_subheadings -> combination_embeddings = [get_embeddings(comb) for comb in keyword_combinations]\n",
+      "Cell In[22], line 18, in get_embeddings -> outputs = model(**inputs)\n",
+      "(repeated torch.nn Module._wrapped_call_impl / Module._call_impl frames omitted)\n",
+      "File c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\models\\distilbert\\modeling_distilbert.py:820, in DistilBertModel.forward -> return self.transformer(x=embeddings, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)\n",
+      "File ...\\transformers\\models\\distilbert\\modeling_distilbert.py:585, in Transformer.forward -> layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions)\n",
+      "File ...\\transformers\\models\\distilbert\\modeling_distilbert.py:530, in TransformerBlock.forward -> ffn_output = self.output_layer_norm(ffn_output + sa_output)\n",
+      "File ...\\torch\\nn\\modules\\normalization.py:202, in LayerNorm.forward -> return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)\n",
+      "File ...\\torch\\nn\\functional.py:2576, in layer_norm -> return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)\n",
+      "KeyboardInterrupt: "
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from transformers import AutoModel, AutoTokenizer\n",
+    "import torch\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from itertools import combinations\n",
+    "\n",
+    "# Load the model and tokenizer\n",
+    "model_name = 'dbmdz/distilbert-base-turkish-cased'\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "model = AutoModel.from_pretrained(model_name)\n",
+    "\n",
+    "def get_embeddings(text, max_length=512):\n",
+    "    \"\"\"\n",
+    "    Converts a text into an embedding vector.\n",
+    "    \"\"\"\n",
+    "    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)\n",
+    "    with torch.no_grad():\n",
+    "        outputs = model(**inputs)\n",
+    "    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()\n",
+    "    return embeddings\n",
+    "\n",
+    "def generate_subheadings(title, keywords, top_n=5):\n",
+    "    \"\"\"\n",
+    "    Generates subheadings from a title and its keywords.\n",
+    "    \"\"\"\n",
+    "    title_embedding = get_embeddings(title)\n",
+    "    \n",
+    "    # Build two- and three-word combinations of the keywords\n",
+    "    keyword_combinations = []\n",
+    "    for i in range(2, 4):  # combinations of 2 and 3 words\n",
+    "        keyword_combinations.extend([' '.join(comb) for comb in combinations(keywords, i)])\n",
+    "    \n",
+    "    # Embed every combination\n",
+    "    combination_embeddings = [get_embeddings(comb) for comb in keyword_combinations]\n",
+    "\n",
+    "    # Compute the similarity between the title and each combination\n",
+    "    similarities = [cosine_similarity(title_embedding.unsqueeze(0), comb_embedding.unsqueeze(0))[0][0].item() for comb_embedding in combination_embeddings]\n",
+    "    \n",
+    "    # Keep the top_n most similar combinations\n",
+    "    top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:top_n]\n",
+    "    subheadings = [keyword_combinations[i] for i in top_indices]\n",
+    "    \n",
+    "    return subheadings\n",
+    "\n",
+    "# Read the CSV file\n",
+    "df = pd.read_csv('top_keywords_per_document.csv')\n",
+    "\n",
+    "# Column names holding the titles and the keywords\n",
+    "title_column = 'title'  # column containing the titles\n",
+    "keywords_column = 'keywords'  # column containing the keywords\n",
+    "\n",
+    "# Pull the titles and keyword strings, dropping missing rows\n",
+    "titles = df[title_column].dropna().tolist()\n",
+    "keywords_list = df[keywords_column].dropna().tolist()\n",
+    "\n",
+    "# Split the comma-separated keyword strings into lists\n",
+    "keywords_list = [keywords.split(',') for keywords in keywords_list]\n",
+    "\n",
+    "# Generate subheadings for each title\n",
+    "for title, keywords in zip(titles, keywords_list):\n",
+    "    subheadings = generate_subheadings(title, keywords)\n",
+    "    print(f\"Başlık: {title}\")\n",
+    "    print(\"Alt Başlıklar:\")\n",
+    "    for subheading in subheadings:\n",
+    "        print(f\"- {subheading}\")\n",
+    "    print(\"\\n\")\n"
+   ]
+  },
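+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Editor's sketch (an assumption, not part of the original notebook): the cell above embeds every 2- and 3-word keyword combination separately, so the number of forward passes grows as C(n,2) + C(n,3) per document, which is why the run had to be interrupted. The sketch below caps the candidate list and scores all candidates with one batched call, reusing the hypothetical `get_embeddings_batch` helper from the earlier sketch."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import combinations\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "def generate_subheadings_capped(title, keywords, top_n=5, max_candidates=50):\n",
+    "    # Candidate pool grows as C(n,2) + C(n,3); cap it before embedding\n",
+    "    candidates = []\n",
+    "    for r in (2, 3):\n",
+    "        candidates.extend(' '.join(c) for c in combinations(keywords, r))\n",
+    "    candidates = candidates[:max_candidates]  # hard cap on forward-pass work\n",
+    "    if not candidates:\n",
+    "        return []\n",
+    "    # One batched call (hypothetical helper from the sketch above) instead of\n",
+    "    # one forward pass per candidate\n",
+    "    embeddings = get_embeddings_batch([title] + candidates)\n",
+    "    title_vec, cand_vecs = embeddings[:1], embeddings[1:]\n",
+    "    sims = cosine_similarity(title_vec, cand_vecs)[0]\n",
+    "    top = sims.argsort()[::-1][:top_n]\n",
+    "    return [candidates[int(i)] for i in top]\n"
+   ]
+  },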
  {
   "cell_type": "code",
   "execution_count": 17,
@@ -114602,20 +124349,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
-     "evalue": "[E050] Can't find model 'tr_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.",
+     "evalue": "[E050] Can't find model 'tr_core_news_lg'. It doesn't seem to be a Python package or a valid path to a data directory.",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "OSError                                   Traceback (most recent call last)",
-      "Cell In[6], line 7 -> nlp = spacy.load('tr_core_news_sm')",
+      "Cell In[8], line 7 -> nlp = spacy.load('tr_core_news_lg')",
      "File c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\spacy\\__init__.py:51, in load -> return util.load_model(name, vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config)",
      "File c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\spacy\\util.py:472, in load_model -> raise IOError(Errors.E050.format(name=name))",
-      "OSError: [E050] Can't find model 'tr_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory."
+      "OSError: [E050] Can't find model 'tr_core_news_lg'. It doesn't seem to be a Python package or a valid path to a data directory."
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
-    "# spaCy Türkçe dil modelini yükleme\n",
-    "nlp = spacy.load('tr_core_news_sm')\n",
+    "# Load the Turkish spaCy model. tr_core_news_md is a third-party package\n",
+    "# (assumption: it must be pip-installed first, e.g. from the turkish-nlp-suite\n",
+    "# releases); once installed, a single spacy.load() call is enough, so the\n",
+    "# duplicate module-import variant from the install instructions is dropped.\n",
+    "nlp = spacy.load(\"tr_core_news_md\")\n",
    "\n",
-    "# Metin temizleme ve yüklemleri çıkarma fonksiyonu\n",
+    "# Text cleaning and verb-removal function\n",
    "def clean_and_remove_verbs(text):\n",