text_mining_21C11027 / 21c11027.py
hvtham's picture
update SECRET_TOKEN
63fc868 verified
raw
history blame contribute delete
No virus
1.97 kB
# -*- coding: utf-8 -*-
"""21C11027.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1z_jG4sUgsIhZRoikoXxYMHNAMpiYlWAW
**ADVANCED TEXT CORPUS MINING**
* **Full name:** Huỳnh Viết Thám
* **Student ID:** 21C11027
# Install the required libraries
"""
# Notebook shell escapes: the leading "!" runs pip from a Colab/Jupyter cell.
# NOTE(review): these lines are not valid syntax for a plain Python
# interpreter; they only work when the file is executed as a notebook.
!pip install beautifulsoup4
!pip install google
# Presumably provides the `serpapi` module imported below -- TODO confirm.
!pip install google-search-results
# Prompt on stdin for the search keyword.
# NOTE(review): no code visible in this file reads this module-level name;
# checkPaper() takes its own parameter of the same name.
publication_name = input("Please input the keyword for searching: ")
from serpapi import GoogleSearch
# SECRET_TOKEN is a placeholder: replace it with your own API key from SerpApi.
def _keyword_hit_ratio(keywords, snippet):
    """Return the fraction of *keywords* found as substrings of *snippet*.

    Matching is case-sensitive. An empty keyword list yields 0.0 so that the
    caller never divides by zero.
    """
    if not keywords:
        return 0.0
    hits = sum(1 for word in keywords if word in snippet)
    return hits / len(keywords)


def checkPaper(publication_name):
    """Look for a GitHub repository that matches a publication title.

    The title is sent to Google through SerpApi, restricted to github.com.
    Every organic result is printed, then the first three are inspected: a
    result is accepted when its link contains ``https://github.com/`` and at
    least 50% of the title's words occur in its snippet.

    Args:
        publication_name: Title (or keywords) of the paper to search for.

    Returns:
        A message saying whether source code was found; when it was, the
        message ends with a newline followed by the matching link.
    """
    # Minimum share of title words that must appear in a result's snippet.
    threshold = 0.5

    # split() without an argument drops empty tokens; an empty string is a
    # substring of every snippet and would otherwise count as a match.
    keywords = publication_name.split() if publication_name else []
    if not keywords:
        # Nothing to search for: skip the API call entirely.
        return "This paper doesn't have source code in github!"

    params = {
        # Placeholder value: a real SerpApi key has to be supplied here.
        "api_key": "SECRET_TOKEN",
        "engine": "google",
        "q": publication_name,
        "location": "Austin, Texas, United States",
        "google_domain": "google.com",
        "gl": "us",
        "hl": "en",
        "as_sitesearch": "github.com"
    }
    results = GoogleSearch(params).get_dict()

    # The key may be absent from the response (for instance when there are
    # no hits or the payload describes an error), so default to no results.
    organic_results = results.get("organic_results", [])

    for result in organic_results:
        # Individual results may lack a field; print an empty value instead
        # of raising KeyError.
        print(f"Title: {result.get('title', '')}\nSummary: {result.get('snippet', '')}\nLink: {result.get('link', '')}\n")

    # Only the top three hits are considered.
    for result in organic_results[:3]:
        link = result.get("link", "")
        if "https://github.com/" not in link:
            continue
        # Compare the hit ratio with the threshold. The previous check
        # (count >= count / len_word_list) was always true, so every GitHub
        # link was accepted regardless of the snippet's content.
        if _keyword_hit_ratio(keywords, result.get("snippet", "")) >= threshold:
            return "This paper has source code in github!\n" + link

    return "This paper doesn't have source code in github!"
# Notebook shell escape: installs Gradio from a Colab/Jupyter cell.
# NOTE(review): not valid syntax for a plain Python interpreter.
!pip install gradio
import gradio as gr
def greet(name):
    """Return the greeting ``Hello <name>!`` for the given name string."""
    pieces = ("Hello ", name, "!")
    return "".join(pieces)
# Build a Gradio interface that passes one text input to checkPaper and shows
# the string it returns as text output.
demo = gr.Interface(fn=checkPaper, inputs="text", outputs="text")
# Start the Gradio app.
demo.launch()