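"""Streamlit app that scrapes product listings from Daraz.pk.

For a given category it discovers the sub-categories and their page counts through the
site's ``ajax=true`` JSON endpoint, downloads every result page, flattens the product
fields into a pandas DataFrame, and displays the table in the browser.
"""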
import math
import json
import random
from time import sleep

import requests
import pandas as pd
import streamlit as st


def get_page_num(category):
    # Discover the sub-categories of the given category and how many result pages each one has
    url = "https://www.daraz.pk/{}/"
    params = {
        "ajax": "true",
        "page": "1",
        "spm": "a2a0e.pdp.cate_3.7.792b39a8ijxOu0"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    response = requests.get(url.format(category), params=params, headers=headers)
    # Get the data of the input category
    if response.status_code == 200:
        data = json.loads(response.text)
        sub_categories = [i['value'] for i in data['mods']['filter']['filterItems'][1]['options']]
        sub_cat = []
        total_page = []
        if len(sub_categories) == 1:  # there is only one sub-category
            total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
            if 0 < total_products <= 40:
                page_num = 1
            else:
                page_num = math.ceil(total_products / 40)  # 40 products per page, so round up to cover the last partial page
            sub_cat.append(sub_categories[0])
            total_page.append(page_num)
        else:  # there is more than one sub-category
            for cat in sub_categories:
                sleep(random.random() * 3)
                print(url.format(cat))  # log which sub-category URL is being queried
                response = requests.get(url.format(cat), params=params, headers=headers)
                data = json.loads(response.text)
                total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
                if 0 < total_products <= 40:
                    page_num = 1
                else:
                    page_num = math.ceil(total_products / 40)
                sub_cat.append(cat)
                total_page.append(page_num)
        return dict(zip(sub_cat, total_page))
    else:
        st.error('Request failed while fetching the category page')
        return {}
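
# Example of the mapping get_page_num returns, assuming a hypothetical category slug
# "womens-bags" with three sub-categories (slugs and counts are illustrative only):
#
#     get_page_num("womens-bags")
#     # -> {"womens-top-handle-bags": 12, "womens-cross-body-bags": 25, "womens-clutches": 4}
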
def fetch_all(total_page, category):
    # Fetch every result page of one sub-category; return the raw JSON pages and a list of failures
    products_data = []
    failed_fetch = []
    # Hard-coded session cookie copied from a browser session; the Session below also keeps
    # any cookies the server sets during the crawl
    cookie = 'lzd_cid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_uid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_fv=1702218509671; _tb_token_=f833be637387a; lzd_sid=1b68e6504b32dd3aea16bc05a3e066d6; cna=gM6uHemxBRICAW5d5vQ57lhr; _gcl_au=1.1.771646885.1702218513; _scid=8c7e8337-e396-48b9-9c3c-583c58ac2095; _fbp=fb.1.1702218515160.263289771; _tt_enable_cookie=1; _ttp=E2zZy8UNpfbfLI204FcRrfQLr-t; _bl_uid=b2l7ep2mz5FkIOyU5cC6zpLn41h5; XSRF-TOKEN=3dd5420c-4af2-42f4-9110-1d95ddeb1ce4; _scid_r=8c7e8337-e396-48b9-9c3c-583c58ac2095; _ga=GA1.2.1262335521.1702218514; _sctr=1%7C1704654000000; mi_p_source=undefined; mi_p_medium=undefined; mi_p_campaign=undefined; mi_p_term=undefined; mi_p_content=undefined; mi_p_gclid=undefined; mi_source=google; mi_medium=organic; mi_campaign=; mi_term=; mi_content=; mi_gclid=; _ga_5L4FRV3KPW=GS1.1.1704696624.4.1.1704696641.43.0.0; _ga_C6SBBPVWWK=GS1.1.1704696624.4.1.1704696641.43.0.0; lwrid=AQGNG20qIm%2B9EVBm5pjV18ZuIyxx; xlly_s=1; hng=PK|en-PK|PKR|586; curTraffic=lazada; userLanguageML=en-PK; epssw=1*i5fB11i_F1XdGEz47zEGttFFh2EnCVz4Jr8M-qEEzrPSCVeId5MyCxHMQWfcCJX4jhAWHU1CbYwQdp62dsnEY79vOxrMjCBWXxEWhv1Gx3CI3p1bjGfWhGxGDxd5HCCULOkS1bCv-3zjowtYvfNIIO1KQQJR_Sqm116A3Dmn-p3JyLB4xDmnxJsNYMDpeo2OetzRy99-xmxJXNWokfNntcSV_9UCrdM3xDDpeDmndLHB; age_limit=18Y; t_sid=tWCKXzfL5dKHJ6P9hwes1SRmzmkVjxe3; utm_channel=NA; _m_h5_tk=66b9f7f529c772a8433e5669e94795f7_1705656645823; _m_h5_tk_enc=db2346c0c8701b95f3e9fa73a1bb8829; daraz-marketing-tracker=hide; JSESSIONID=B7914CD57669FACF1D3CEC5DD16F1DCF; tfstk=e_Gv0zGWQ94fh9SUEIpo_4pikl8kEj32ymuCslqcC0n-R2S0ChJ4C53YqlV0Gn8tX2ojoRcmlfCt2uB0Ih84CGinXPqj6V5T5ViwulY2Equ4_5tHxpAnuq52kcP9tDchLeN6xHAu8Gx505MmRKLAxzLvenxAtRGUlpNCDv9s6jUYHk6021C9SreYAqZR4glJ_-EIluC14Whn9p31xkUGhULRQOybzYLe2xK_aqCLykYkrOWaemU8xULRQOybzzEHz0WNQ8nA.; l=fBIMqdfPPaW2mUTtBOfanurza77OSIRvXuPzaNbMi9fP_-Cp5-FCW1INsGL9C3HNFswHR3kC9mckBeYBYIXN3PYinmnv_CkmnXr9aX5..; isg=BNzcaeHm1UWTzKGUhk2R8EJyrfOOVYB_YmiEt7bd6EeoAXyL3mVQD1KzYHH5wrjX'
    session = requests.Session()
    for number in range(1, total_page + 1):  # pages are 1-indexed, so include the last page
        st.write(f"Fetching page {number} of {total_page}")
        sleep(random.random() * 5)
        url = f"https://www.daraz.pk/{category}/"
        params = {
            "ajax": "true",
            "page": str(number),
            "spm": "a2a0e.searchlistcategory.pagination.5.7d725753tvFnfz"
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Cookie": cookie
        }
        request = session.get(url, headers=headers, params=params, allow_redirects=True)
        if request.status_code == 200:
            # JSON decoding occasionally fails, so record the page and keep going instead of crashing
            try:
                data = json.loads(request.text)
                if 'listItems' in data['mods'].keys():
                    products_data.append(data)
                else:
                    failed_fetch.append(f'Data could not be fetched from page {number}')
            except (json.JSONDecodeError, KeyError):
                failed_fetch.append(f'JSON decoding failed at page {number}')
                continue
        else:
            failed_fetch.append(f'Request failed for page {number}')
    return products_data, failed_fetch
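
# Minimal usage sketch (the category slug below is hypothetical): fetch five pages of one
# sub-category and report anything that failed.
#
#     pages, failures = fetch_all(5, "womens-clutches")
#     print(len(pages), "pages fetched,", len(failures), "failures")
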
def main_scrap(main_category):
    page_nums_dict = get_page_num(main_category)  # map every sub-category to its total page count
    extract_data = {}  # holds the raw page data of every sub-category
    failed_data = {}   # holds the failure messages of every sub-category
    for category, page_num in page_nums_dict.items():  # loop through the sub-categories
        category_data, failed_scrap = fetch_all(page_num, category)  # fetch all pages of this sub-category
        extract_data[category] = category_data  # key the raw per-page data by sub-category name
        failed_data[category] = failed_scrap    # same for the failure messages
    # Once the loop over all sub-categories completes
    st.write('Scraping finished successfully')
    return extract_data, failed_data
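
# Shape of the two dicts main_scrap returns (illustrative only): one key per sub-category,
# mapping to the raw per-page JSON dicts (extract_data) or to failure messages (failed_data),
# e.g. {"womens-clutches": [page1_json, page2_json, ...]} and
#      {"womens-clutches": ["Request failed for page 3"]}
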
def get_relevant(list_40):
    # Extract the relevant fields from one result page (up to 40 products)
    items = list_40['mods']['listItems']
    data_dict = {
        'name': [i.get('name', 'None') for i in items],
        'current_price': [i.get('utLogMap', {}).get('current_price', 'None') for i in items],
        'original_price': [i.get('utLogMap', {}).get('originalPrice', 'None') for i in items],
        'x_ad': [i.get('utLogMap', {}).get('x_ad', 'None') for i in items],
        'voucherIds': [i.get('utLogMap', {}).get('voucherIds', 'None') for i in items],
        'productUrl': [i.get('productUrl', 'None') for i in items],
        'ratingScore': [i.get('ratingScore', 'None') for i in items],
        'review': [i.get('review', 'None') for i in items],
        'location': [i.get('location', 'None') for i in items],
        'sku': [i.get('sku', 'None') for i in items],
        'description': [i.get('description', 'None') for i in items],
        'brandName': [i.get('brandName', 'None') for i in items],
        'sellerId': [i.get('sellerId', 'None') for i in items],
        'sellerName': [i.get('sellerName', 'None') for i in items],
        'image': [i.get('image', 'None') for i in items],
        'itemSold': [i.get('soldInfo', {'soldNum': None}).get('soldNum', 'None') for i in items],
        'isAD': [i.get('isAD', 'None') for i in items],
        'inStock': [i.get('inStock', 'None') for i in items],
    }
    return data_dict
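
# The fields above assume each entry in data['mods']['listItems'] roughly follows this shape
# (a sketch reconstructed from the keys used here, not an official schema):
#
#     {
#         "name": "...", "productUrl": "//www.daraz.pk/products/...", "ratingScore": "4.8",
#         "review": "120", "location": "...", "sku": "...", "description": ["..."],
#         "brandName": "...", "sellerId": "...", "sellerName": "...", "image": "...",
#         "isAD": False, "inStock": True, "soldInfo": {"soldNum": "500"},
#         "utLogMap": {"current_price": "...", "originalPrice": "...", "x_ad": "...", "voucherIds": "..."}
#     }
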
def category_info(category_data):
    # Combine the per-page dicts of one sub-category into a single column -> list-of-values dict
    all_products = {
        'image': [],
        'name': [],
        'current_price': [],
        'original_price': [],
        'ratingScore': [],
        'review': [],
        'productUrl': [],
        'sellerName': [],
        'itemSold': [],
        'inStock': [],
        'isAD': [],
        'x_ad': [],
        'voucherIds': [],
        'location': [],
        'sku': [],
        'description': [],
        'brandName': [],
        'sellerId': [],
    }
    for products_40 in category_data:
        return_dic = get_relevant(products_40)
        for key in all_products:
            all_products[key].extend(return_dic[key])
    return all_products
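
# Note: an equivalent way to flatten the per-page dicts (a sketch, not part of the original
# app) is to build one DataFrame per page and concatenate them:
#
#     frames = [pd.DataFrame(get_relevant(p)) for p in category_data]
#     combined = pd.concat(frames, ignore_index=True)
#
# The loop above keeps the original dict-of-lists structure instead.
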
def extract_clean(main_category_name):
    # Scrape every sub-category of the main category and return one cleaned DataFrame
    category_combine = {
        'image': [],
        'name': [],
        'current_price': [],
        'original_price': [],
        'ratingScore': [],
        'review': [],
        'productUrl': [],
        'sellerName': [],
        'itemSold': [],
        'inStock': [],
        'isAD': [],
        'x_ad': [],
        'voucherIds': [],
        'location': [],
        'sku': [],
        'description': [],
        'brandName': [],
        'sellerId': [],
    }
    extract_data, failed_data = main_scrap(main_category_name)  # extract_data maps each sub-category to its raw page data
    for c in extract_data.keys():
        dict_cat = category_info(extract_data[c])  # one column -> list-of-values dict per sub-category
        for cat_keys in dict_cat.keys():
            category_combine[cat_keys].extend(dict_cat[cat_keys])
    df = pd.DataFrame(category_combine)
    df = df.astype(str)
    df['productUrl'] = df['productUrl'].str.lstrip('/')  # drop the leading '//' of protocol-relative URLs
    df['description'] = df['description'].str.strip('[]').str.strip("'")
    return df, extract_data.keys()
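
# Optional extension (not part of the original app): once the DataFrame is built inside
# main(), a CSV download button could be offered with Streamlit's download API, e.g.:
#
#     csv_bytes = df.to_csv(index=False).encode("utf-8")
#     st.download_button("Download CSV", data=csv_bytes, file_name=f"{category}.csv", mime="text/csv")
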
# Streamlit app
def main():
    df = pd.DataFrame()
    st.title('Products Data Scraping > Daraz.pk')
    category = st.text_input('Please input the category name')
    if st.button('Start Scraping'):
        df, categories = extract_clean(category)
        if not df.empty:
            st.dataframe(df)


if __name__ == "__main__":
    main()
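
# Run locally (assuming this file is saved as app.py):
#     streamlit run app.py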