{ "cells": [
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Third-party requirements: pandas, pymongo, requests, beautifulsoup4 (provides the `bs4` module).\n",
   "# The last recorded run of this cell failed with ModuleNotFoundError because bs4 was not installed.\n",
   "import csv\n",
   "import pandas as pd\n",
   "from pymongo import MongoClient\n",
   "\n",
   "import requests\n",
   "from bs4 import BeautifulSoup\n"
 ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
   "# Connect to MongoDB\n",
   "client = MongoClient(\"mongodb://localhost:27017/\")\n",
   "db = client[\"myDatabase\"]\n",
   "source_collection = db[\"data\"]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Disabled: fetch every document of the collection and print it.\n",
   "# Kept as comments (not as a bare string literal, which the cell would echo as its output).\n",
   "# NOTE(review): the author notes the data was downloaded as JSON instead -- presumably the file read in the next cell; confirm.\n",
   "# yeni_data = list(source_collection.find())\n",
   "# print(yeni_data)"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   " _id title \\\n",
   "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n",
   "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n",
   "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n",
   "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n",
   "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n",
   "\n",
   " url authors \\\n",
   "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n",
   "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n",
   "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n",
   "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n",
   "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n",
   "\n",
   " timestamp \\\n",
   "0 2020-12-26 03:38:10.479000+00:00 \n",
   "1 2020-09-23 22:10:17.126000+00:00 \n",
   "2 2020-10-10 20:17:37.132000+00:00 \n",
   "3 2020-12-21 16:05:19.524000+00:00 \n",
   "4 2020-02-26 00:01:01.576000+00:00 \n",
   "\n",
   " tags \n",
   "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n",
   "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n",
   "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n",
   "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n",
   "4 ['Brain', 'Health', 'Development', 'Psychology... \n"
 ] } ], "source": [
   "# Load the JSON export of the collection into a DataFrame (the file is JSON, not CSV)\n",
   "df=pd.read_json('myDatabase.data.json')\n",
   "print(df.head())"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
_idtitleurlauthorstimestamptags
0{'$oid': '66a1020f29abc84d21689044'}Mental Note Vol. 24https://medium.com/invisible-illness/mental-no...['Ryan Fan']2020-12-26 03:38:10.479000+00:00['Mental Health', 'Health', 'Psychology', 'Sci...
1{'$oid': '66a1020f29abc84d21689045'}Your Brain On Coronavirushttps://medium.com/age-of-awareness/how-the-pa...['Simon Spichak']2020-09-23 22:10:17.126000+00:00['Mental Health', 'Coronavirus', 'Science', 'P...
2{'$oid': '66a1020f29abc84d21689046'}Mind Your Nosehttps://medium.com/neodotlife/mind-your-nose-f...[]2020-10-10 20:17:37.132000+00:00['Biotechnology', 'Neuroscience', 'Brain', 'We...
3{'$oid': '66a1020f29abc84d21689047'}The 4 Purposes of Dreamshttps://medium.com/science-for-real/the-4-purp...['Eshan Samaranayake']2020-12-21 16:05:19.524000+00:00['Health', 'Neuroscience', 'Mental Health', 'P...
4{'$oid': '66a1020f29abc84d21689048'}Surviving a Rod Through the Headhttps://medium.com/live-your-life-on-purpose/s...['Rishav Sinha']2020-02-26 00:01:01.576000+00:00['Brain', 'Health', 'Development', 'Psychology...
\n", "
" ], "text/plain": [ " _id title \\\n", "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n", "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n", "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n", "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n", "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n", "\n", " url authors \\\n", "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n", "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n", "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n", "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n", "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n", "\n", " timestamp \\\n", "0 2020-12-26 03:38:10.479000+00:00 \n", "1 2020-09-23 22:10:17.126000+00:00 \n", "2 2020-10-10 20:17:37.132000+00:00 \n", "3 2020-12-21 16:05:19.524000+00:00 \n", "4 2020-02-26 00:01:01.576000+00:00 \n", "\n", " tags \n", "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n", "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n", "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n", "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n", "4 ['Brain', 'Health', 'Development', 'Psychology... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "_id object\n", "title object\n", "url object\n", "authors object\n", "timestamp object\n", "tags object\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 \n", "Name: _id, dtype: object\n", "0 \n", "Name: title, dtype: object\n", "0 \n", "Name: url, dtype: object\n", "0 \n", "Name: authors, dtype: object\n", "0 \n", "Name: timestamp, dtype: object\n", "0 \n", "Name: tags, dtype: object\n" ] } ], "source": [
   "for i in df.columns:\n",
   "    print(df[i].apply(lambda x:type(x)).head(1))"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Download the article text behind every URL in the data frame.\n",
   "# requests applies no timeout by default; bound each call so one stalled server cannot hang the loop.\n",
   "REQUEST_TIMEOUT_S = 10\n",
   "\n",
   "\n",
   "def fetch_article_text(url):\n",
   "    \"\"\"Return the text inside the page's <article> element, or None.\n",
   "\n",
   "    None is returned when the request fails (connection error, timeout,\n",
   "    HTTP error status) or when the page has no <article> element, so that\n",
   "    failure messages are never stored as article text.\n",
   "    \"\"\"\n",
   "    try:\n",
   "        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)\n",
   "        response.raise_for_status()\n",
   "    except requests.RequestException as exc:\n",
   "        print(f'error retrieving content: {url}: {exc}')\n",
   "        return None\n",
   "\n",
   "    soup = BeautifulSoup(response.content, 'html.parser')\n",
   "    # The HTML element is <article>; the former selector 'articles' is not\n",
   "    # an HTML tag, so the lookup could not find the story body.\n",
   "    # NOTE(review): assumes Medium wraps the story in <article> -- confirm.\n",
   "    article = soup.find('article')\n",
   "    if article is None:\n",
   "        return None\n",
   "    # Separate text nodes with a space so words from adjacent tags do not fuse.\n",
   "    return article.get_text(separator=' ', strip=True)\n",
   "\n",
   "\n",
   "# One entry per row of df, in the same order as df['url'].\n",
   "contents = [fetch_article_text(url) for url in df['url']]\n",
   "\n",
   "# Attach the scraped text to the data frame (None marks a failed download).\n",
   "df['content'] = contents\n",
   "\n",
   "# Check the new column\n",
   "n_retrieved = df['content'].notna().sum()\n",
   "print(f'retrieved {n_retrieved} of {len(df)} articles')\n",
   "df.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Build the train / validation split for model training.\n",
   "import ast\n",
   "\n",
   "from sklearn.model_selection import train_test_split\n",
   "\n",
   "\n",
   "def first_tag(tags):\n",
   "    \"\"\"Return the first tag of a row, or None when there is none.\n",
   "\n",
   "    Accepts a real list/tuple or its text form (a list literal stored as a string).\n",
   "    \"\"\"\n",
   "    if isinstance(tags, str):\n",
   "        try:\n",
   "            tags = ast.literal_eval(tags)\n",
   "        except (ValueError, SyntaxError):\n",
   "            return None\n",
   "    if isinstance(tags, (list, tuple)) and tags:\n",
   "        return tags[0]\n",
   "    return None\n",
   "\n",
   "\n",
   "# NOTE(review): this cell used to pass the undefined name `translated_data`\n",
   "# as both features and labels (NameError on a fresh kernel). Assumed intent:\n",
   "# predict an article's first tag from its text -- confirm the target column.\n",
   "labelled = df.assign(label=df['tags'].map(first_tag)).dropna(subset=['content', 'label'])\n",
   "assert len(labelled) > 1, 'need at least two articles with both text and a tag'\n",
   "\n",
   "X_train, X_val, y_train, y_val = train_test_split(\n",
   "    labelled['content'], labelled['label'], test_size=0.2, random_state=42\n",
   ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "from sklearn.feature_extraction.text import TfidfVectorizer\n",
   "from sklearn.svm import SVC\n",
   "\n",
   "vectorizer = TfidfVectorizer()\n",
   "X_train_transformed = vectorizer.fit_transform(X_train)\n",
   "X_val_transformed = vectorizer.transform(X_val)\n",
   "\n",
   "model = SVC()\n",
   "model.fit(X_train_transformed, y_train)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "from sklearn.metrics import accuracy_score\n",
   "\n",
   "y_pred = model.predict(X_val_transformed)\n",
   "accuracy = accuracy_score(y_val, y_pred)\n",
   "print(f\"Accuracy: {accuracy:.2f}\")"
 ] }
], "metadata": { "kernelspec": { "display_name": "myenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }