{ "cells": [ { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "import spacy\n", "from sumy.parsers.plaintext import PlaintextParser\n", "from sumy.nlp.tokenizers import Tokenizer" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting en-core-web-sm==3.5.0\n", " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n", "\u001b[2K ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 3.9 MB/s eta 0:00:00\n", "\u001b[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from en-core-web-sm==3.5.0) (3.5.3)\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.4)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.7)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.6)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.6)\n", "Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.10)\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.9.1)\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.6)\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.7)\n", "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.4.1)\n", "Requirement already satisfied: pathy>=0.10.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.1)\n", "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (5.2.1)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.65.0)\n", "Requirement already satisfied: numpy>=1.15.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.24.3)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.29.0)\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.8)\n", "Requirement already satisfied: jinja2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n", "Requirement already satisfied: setuptools in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (67.8.0)\n", "Requirement already satisfied: packaging>=20.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (23.0)\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n", "Requirement already satisfied: typing-extensions>=4.2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.7.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2023.7.22)\n", "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.9)\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.0.4)\n", "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.0.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /home/sssingh/miniconda3/envs/nlp/lib/python3.11/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.1)\n", "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('en_core_web_sm')\n" ] }, { "data": { "text/plain": [ "CompletedProcess(args=['python', '-m', 'spacy', 'download', 'en_core_web_sm'], returncode=0)" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "subprocess.run([\"python\", \"-m\", \"spacy\", \"download\", \"en_core_web_sm\"])" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "This is an example text in Singapore by Sunil Singh on 6th August 2023" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nlp = spacy.load('en_core_web_sm')\n", "doc = nlp(\"This is an example text in Singapore by Sunil Singh on 6th August 2023\")\n", "doc" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\n \\n displaCy\\n \\n\\n \\n
\\n
This is an example text in \\n\\n Singapore\\n GPE\\n\\n by \\n\\n Sunil Singh\\n GPE\\n\\n on \\n\\n 6th August 2023\\n DATE\\n\\n
\\n
\\n\\n'" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_html = displacy.render(docs=doc, style=\"ent\", jupyter=False, page=True)\n", "ner_html" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Entity CodeEntity Description
0DATEAbsolute or relative dates or periods
1GPECountries, cities, states
\n", "
" ], "text/plain": [ " Entity Code Entity Description\n", "0 DATE Absolute or relative dates or periods\n", "1 GPE Countries, cities, states" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "label, desc = [],[]\n", "for ent in doc.ents:\n", " label.append(ent.label_)\n", " desc.append(spacy.explain(ent.label_))\n", "label, desc = list(set(label)), list(set(desc))\n", "df = pd.DataFrame(data={\"Entity Code\":label, \"Entity Description\":desc})\n", "df" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(Singapore, Sunil Singh, 6th August 2023)" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc.ents" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This PRON pronoun\n", "is AUX auxiliary\n", "an DET determiner\n", "example NOUN noun\n", "text NOUN noun\n", "in ADP adposition\n", "Singapore PROPN proper noun\n", "by ADP adposition\n", "Sunil PROPN proper noun\n", "Singh PROPN proper noun\n", "on ADP adposition\n", "6th ADJ adjective\n", "August PROPN proper noun\n", "2023 NUM numeral\n" ] } ], "source": [ "for token in doc:\n", " print(token.text, token.pos_, spacy.explain(token.pos_))" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TokenTagPosDescription
0exampleNNNOUNnoun, singular or mass
1textNNNOUNnoun, singular or mass
2SingaporeNNPPROPNnoun, proper singular
3SunilNNPPROPNnoun, proper singular
4SinghNNPPROPNnoun, proper singular
56thJJADJadjective (English), other noun-modifier (Chin...
6AugustNNPPROPNnoun, proper singular
72023CDNUMcardinal number
\n", "
" ], "text/plain": [ " Token Tag Pos Description\n", "0 example NN NOUN noun, singular or mass\n", "1 text NN NOUN noun, singular or mass\n", "2 Singapore NNP PROPN noun, proper singular\n", "3 Sunil NNP PROPN noun, proper singular\n", "4 Singh NNP PROPN noun, proper singular\n", "5 6th JJ ADJ adjective (English), other noun-modifier (Chin...\n", "6 August NNP PROPN noun, proper singular\n", "7 2023 CD NUM cardinal number" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word, tag, pos, desc = [], [], [], []\n", "for token in doc:\n", " if token.is_stop or token.is_punct:\n", " continue\n", " word.append(str(token))\n", " tag.append(str(token.tag_))\n", " pos.append(token.pos_)\n", " desc.append(spacy.explain(token.tag_))\n", "pd.DataFrame(data=dict(Token=word, Tag=tag, Pos=pos, Description=desc))\n" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "import sys\n", "from sumy.parsers.plaintext import PlaintextParser\n", "from sumy.nlp.tokenizers import Tokenizer\n", "from sumy.summarizers.text_rank import TextRankSummarizer\n", "from sumy.summarizers.lex_rank import LexRankSummarizer\n", "from sumy.summarizers.lsa import LsaSummarizer\n", "from dataclasses import dataclass\n", "@dataclass\n", "class __AppConfig:\n", " \"\"\"app-wide configurations\"\"\"\n", " summarizers = dict(\n", " TextRankSummarizer=\"sumy.summarizers.text_rank\",\n", " LexRankSummarizer=\"sumy.summarizers.lex_rank\",\n", " LsaSummarizer=\"sumy.summarizers.lsa\",\n", " )\n", "### make configs available to any module that imports this module\n", "app_config = __AppConfig()" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "def class_from_name(module, class_name):\n", " return getattr(module, class_name)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "method=\"TextRankSummarizer\"\n", "def get_summarizer(method):\n", " module=sys.modules[app_config.summarizers.get(method)]\n", " summarizer = class_from_name(module, method)\n", " return summarizer" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "text = \"\"\"Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n", "Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.\"\"\"" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parser = PlaintextParser.from_string(text, Tokenizer(\"english\"))\n", "parser" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parser.document.sentences" ] } ], "metadata": { "kernelspec": { "display_name": "nlp", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }