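# Streamlit app that scrapes product listings from Daraz.pk category pages
# and presents them as a DataFrame. Run with: streamlit run app.py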
import json
import math
import random
from time import sleep

import pandas as pd
import requests
import streamlit as st


def get_page_num(category):
    """Return a {sub_category: total_pages} dict for a Daraz category slug."""
    url = "https://www.daraz.pk/{}/"
    params = {
        "ajax": "true",
        "page": "1",
        "spm": "a2a0e.pdp.cate_3.7.792b39a8ijxOu0",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    response = requests.get(url.format(category), params=params, headers=headers)
    # Get the first listing page for the input category.
    if response.status_code == 200:
        data = json.loads(response.text)
        sub_categories = [i['value'] for i in data['mods']['filter']['filterItems'][1]['options']]
        total_page = []
        if len(sub_categories) == 1:  # there is only one sub-category
            total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
            # 40 products per page; ceil keeps the final partial page (floor would drop it).
            page_num = math.ceil(total_products / 40)
            total_page.append(page_num)
        else:  # there is more than one sub-category
            for cat in sub_categories:
                sleep(random.random() * 3)  # randomised delay between requests
                print(url.format(cat))  # debug: log which sub-category URL is hit
                response = requests.get(url.format(cat), params=params, headers=headers)
                data = json.loads(response.text)
                total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
                page_num = math.ceil(total_products / 40)
                total_page.append(page_num)
        return dict(zip(sub_categories, total_page))
    else:
        # An empty dict keeps callers that iterate over the result safe.
        return {}
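
# Illustrative sketch of the return shape (slug and counts are hypothetical):
#   get_page_num("mens-sneakers")  ->  {"Sneakers": 25, "Slip-Ons": 4}
# Each value is how many 40-product pages fetch_all() will request.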

def fetch_all(total_page, category):
    """Fetch every result page of one sub-category; return (pages, failures)."""
    products_data = []
    failed_fetch = []
    # Browser cookie captured from a real session; Daraz may reject cookieless requests.
    cookie = 'lzd_cid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_uid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_fv=1702218509671; _tb_token_=f833be637387a; lzd_sid=1b68e6504b32dd3aea16bc05a3e066d6; cna=gM6uHemxBRICAW5d5vQ57lhr; _gcl_au=1.1.771646885.1702218513; _scid=8c7e8337-e396-48b9-9c3c-583c58ac2095; _fbp=fb.1.1702218515160.263289771; _tt_enable_cookie=1; _ttp=E2zZy8UNpfbfLI204FcRrfQLr-t; _bl_uid=b2l7ep2mz5FkIOyU5cC6zpLn41h5; XSRF-TOKEN=3dd5420c-4af2-42f4-9110-1d95ddeb1ce4; _scid_r=8c7e8337-e396-48b9-9c3c-583c58ac2095; _ga=GA1.2.1262335521.1702218514; _sctr=1%7C1704654000000; mi_p_source=undefined; mi_p_medium=undefined; mi_p_campaign=undefined; mi_p_term=undefined; mi_p_content=undefined; mi_p_gclid=undefined; mi_source=google; mi_medium=organic; mi_campaign=; mi_term=; mi_content=; mi_gclid=; _ga_5L4FRV3KPW=GS1.1.1704696624.4.1.1704696641.43.0.0; _ga_C6SBBPVWWK=GS1.1.1704696624.4.1.1704696641.43.0.0; lwrid=AQGNG20qIm%2B9EVBm5pjV18ZuIyxx; xlly_s=1; hng=PK|en-PK|PKR|586; curTraffic=lazada; userLanguageML=en-PK; epssw=1*i5fB11i_F1XdGEz47zEGttFFh2EnCVz4Jr8M-qEEzrPSCVeId5MyCxHMQWfcCJX4jhAWHU1CbYwQdp62dsnEY79vOxrMjCBWXxEWhv1Gx3CI3p1bjGfWhGxGDxd5HCCULOkS1bCv-3zjowtYvfNIIO1KQQJR_Sqm116A3Dmn-p3JyLB4xDmnxJsNYMDpeo2OetzRy99-xmxJXNWokfNntcSV_9UCrdM3xDDpeDmndLHB; age_limit=18Y; t_sid=tWCKXzfL5dKHJ6P9hwes1SRmzmkVjxe3; utm_channel=NA; _m_h5_tk=66b9f7f529c772a8433e5669e94795f7_1705656645823; _m_h5_tk_enc=db2346c0c8701b95f3e9fa73a1bb8829; daraz-marketing-tracker=hide; JSESSIONID=B7914CD57669FACF1D3CEC5DD16F1DCF; tfstk=e_Gv0zGWQ94fh9SUEIpo_4pikl8kEj32ymuCslqcC0n-R2S0ChJ4C53YqlV0Gn8tX2ojoRcmlfCt2uB0Ih84CGinXPqj6V5T5ViwulY2Equ4_5tHxpAnuq52kcP9tDchLeN6xHAu8Gx505MmRKLAxzLvenxAtRGUlpNCDv9s6jUYHk6021C9SreYAqZR4glJ_-EIluC14Whn9p31xkUGhULRQOybzYLe2xK_aqCLykYkrOWaemU8xULRQOybzzEHz0WNQ8nA.; l=fBIMqdfPPaW2mUTtBOfanurza77OSIRvXuPzaNbMi9fP_-Cp5-FCW1INsGL9C3HNFswHR3kC9mckBeYBYIXN3PYinmnv_CkmnXr9aX5..; isg=BNzcaeHm1UWTzKGUhk2R8EJyrfOOVYB_YmiEt7bd6EeoAXyL3mVQD1KzYHH5wrjX'
    session = requests.Session()
    for number in range(1, total_page + 1):  # +1 so the final page is requested too
        st.write(f"Fetching page {number}")
        sleep(random.random() * 5)  # randomised delay to avoid rate limiting
        url = f"https://www.daraz.pk/{category}/"
        params = {
            "ajax": "true",
            "page": str(number),
            "spm": "a2a0e.searchlistcategory.pagination.5.7d725753tvFnfz",
        }
        # Merge any cookies the session has accumulated into the Cookie header.
        session_cookies = "; ".join(f"{k}={v}" for k, v in session.cookies.get_dict().items())
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Cookie": cookie + ("; " + session_cookies if session_cookies else ""),
        }
        request = session.get(url, headers=headers, params=params, allow_redirects=True)
        if request.status_code == 200:
            # The response is occasionally not valid JSON; catch and record it.
            try:
                data = json.loads(request.text)
                if 'listItems' in data['mods']:
                    products_data.append(data)
                else:
                    failed_fetch.append(f'Data could not be fetched from page {number}')
            except json.JSONDecodeError:
                failed_fetch.append(f'JSON decoding failed at page {number}')
        else:
            failed_fetch.append(f'Request failed for page {number}')
    return products_data, failed_fetch
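
# Hypothetical usage sketch (slug and outputs are illustrative assumptions):
#   pages, failures = fetch_all(3, "mens-sneakers")
#   pages     ->  up to 3 raw JSON page dicts
#   failures  ->  e.g. ['Request failed for page 2']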

def main_scrap(main_category):
    """Scrape every sub-category of a main category into raw page dicts."""
    page_nums_dict = get_page_num(main_category)  # {sub_category: total page count}
    extract_data = {}  # raw page data per sub-category
    failed_data = {}   # failure messages per sub-category
    for category, page_num in page_nums_dict.items():
        category_data, failed_scrap = fetch_all(page_num, category)
        extract_data[category] = category_data
        failed_data[category] = failed_scrap
    st.write('Finished scraping successfully')  # once the loop completes
    return extract_data, failed_data
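
# Hypothetical end-to-end sketch (the slug is an assumption, not a verified category):
#   raw, failed = main_scrap("mens-sneakers")
#   raw.keys()  ->  sub-category names discovered by get_page_num()
#   failed      ->  per-sub-category failure messages from fetch_all()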

def get_relevant(list_40):
    """Flatten one result page (up to 40 products) into a column -> values dict."""
    items = list_40['mods']['listItems']

    def ut(i, key):
        # utLogMap can be missing entirely, so default to an empty dict
        # (the original defaulted to the string 'None', which has no .get).
        return i.get('utLogMap', {}).get(key, 'None')

    data_dict = {
        'name': [i.get('name', 'None') for i in items],
        'current_price': [ut(i, 'current_price') for i in items],
        'original_price': [ut(i, 'originalPrice') for i in items],
        'x_ad': [ut(i, 'x_ad') for i in items],
        'voucherIds': [ut(i, 'voucherIds') for i in items],
        'productUrl': [i.get('productUrl', 'None') for i in items],
        'ratingScore': [i.get('ratingScore', 'None') for i in items],
        'review': [i.get('review', 'None') for i in items],
        'location': [i.get('location', 'None') for i in items],
        'sku': [i.get('sku', 'None') for i in items],
        'description': [i.get('description', 'None') for i in items],
        'brandName': [i.get('brandName', 'None') for i in items],
        'sellerId': [i.get('sellerId', 'None') for i in items],
        'sellerName': [i.get('sellerName', 'None') for i in items],
        'image': [i.get('image', 'None') for i in items],
        'itemSold': [i.get('soldInfo', {}).get('soldNum', 'None') for i in items],
        'isAD': [i.get('isAD', 'None') for i in items],
        'inStock': [i.get('inStock', 'None') for i in items],
    }
    return data_dict
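
# Sketch of the returned shape (values illustrative only):
#   {'name': ['Product A', ...], 'current_price': ['999', ...], ...}
# Every list has one entry per product, so the columns stay aligned when they
# later become DataFrame columns.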

# Column order shared by category_info() and extract_clean().
COLUMNS = [
    'image', 'name', 'current_price', 'original_price', 'ratingScore',
    'review', 'productUrl', 'sellerName', 'itemSold', 'inStock', 'isAD',
    'x_ad', 'voucherIds', 'location', 'sku', 'description', 'brandName',
    'sellerId',
]


def category_info(category_data):
    """Combine all 40-product pages of one sub-category into a single dict."""
    all_products = {col: [] for col in COLUMNS}
    for products_40 in category_data:
        return_dic = get_relevant(products_40)
        for col in COLUMNS:
            all_products[col].extend(return_dic[col])
    return all_products
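
# Hypothetical example: pages holding 40, 40 and 25 products combine so that
#   len(category_info(category_data)['name']) == 105
# and every other column has the same length.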

def extract_clean(main_category_name):
    """Scrape a main category and return (cleaned DataFrame, sub-category names)."""
    category_combine = {col: [] for col in COLUMNS}
    # extract_data maps each sub-category name to its raw page data.
    extract_data, failed_data = main_scrap(main_category_name)
    for c in extract_data:
        dict_cat = category_info(extract_data[c])  # column -> combined values for this sub-category
        for cat_key in dict_cat:
            category_combine[cat_key].extend(dict_cat[cat_key])
    df = pd.DataFrame(category_combine)
    df = df.astype(str)
    df['productUrl'] = df['productUrl'].str.lstrip('/')  # URLs arrive protocol-relative ("//www...")
    df.description = df.description.str.strip('[]').str.strip("'")  # descriptions arrive as stringified lists
    return df, extract_data.keys()
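
# Hypothetical usage outside Streamlit (slug and filename are assumptions):
#   df, sub_cats = extract_clean("mens-sneakers")
#   df.to_csv("daraz_products.csv", index=False)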

# Streamlit entry point
def main():
    st.title('Product Data Scraping > Daraz.pk')
    category = st.text_input('Please input the category name')
    if st.button('Start Scraping'):
        df, categories = extract_clean(category)
        if not df.empty:
            st.dataframe(df)


if __name__ == "__main__":
    main()