import json
import math
import random
from time import sleep

import pandas as pd
import requests
import streamlit as st

# Browser-like User-Agent shared by all requests; Daraz serves JSON when "ajax=true" is passed.
USER_AGENT = "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# Product fields extracted from every listing page.
FIELDS = [
    'image', 'name', 'current_price', 'original_price', 'ratingScore', 'review',
    'productUrl', 'sellerName', 'itemSold', 'inStock', 'isAD', 'x_ad',
    'voucherIds', 'location', 'sku', 'description', 'brandName', 'sellerId',
]


def get_page_num(category):
    """Return a dict mapping each sub-category of `category` to its number of listing pages."""
    url = "https://www.daraz.pk/{}/"
    params = {
        "ajax": "true",
        "page": "1",
        "spm": "a2a0e.pdp.cate_3.7.792b39a8ijxOu0"
    }
    headers = {"User-Agent": USER_AGENT}
    response = requests.get(url.format(category), params=params, headers=headers)  # get data of the input category
    if response.status_code != 200:
        st.write('Request for the input category failed')
        return {}  # empty dict so the caller's loop simply does nothing

    data = json.loads(response.text)
    # filterItems[1] holds the sub-category filter options of the listing page
    sub_categories = [i['value'] for i in data['mods']['filter']['filterItems'][1]['options']]
    total_page = []
    if len(sub_categories) == 1:  # only one sub-category: reuse the response we already have
        total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
        # 40 products per page; ceil so a partial last page is still counted
        total_page.append(1 if 0 < total_products <= 40 else math.ceil(total_products / 40))
    else:  # more than one sub-category: request each one to read its product count
        for cat in sub_categories:
            sleep(random.random() * 3)  # small random delay between requests
            print(url.format(cat))
            response = requests.get(url.format(cat), params=params, headers=headers)
            data = json.loads(response.text)
            total_products = int(data['mods']['resultTips']['tips'].split()[0].replace(',', ''))
            total_page.append(1 if 0 < total_products <= 40 else math.ceil(total_products / 40))
    return dict(zip(sub_categories, total_page))


def fetch_all(total_page, category):
    """Fetch all listing pages of one (sub-)category; return (raw page payloads, failure messages)."""
    products_data = []
    failed_fetch = []
    # Hard-coded cookie captured from a browser session; it will expire and must be refreshed manually.
    cookie = 'lzd_cid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_uid=7f1ceb5d-2600-4a26-be64-2667cfebec91; t_fv=1702218509671; _tb_token_=f833be637387a; lzd_sid=1b68e6504b32dd3aea16bc05a3e066d6; cna=gM6uHemxBRICAW5d5vQ57lhr; _gcl_au=1.1.771646885.1702218513; _scid=8c7e8337-e396-48b9-9c3c-583c58ac2095; _fbp=fb.1.1702218515160.263289771; _tt_enable_cookie=1; _ttp=E2zZy8UNpfbfLI204FcRrfQLr-t; _bl_uid=b2l7ep2mz5FkIOyU5cC6zpLn41h5; XSRF-TOKEN=3dd5420c-4af2-42f4-9110-1d95ddeb1ce4; _scid_r=8c7e8337-e396-48b9-9c3c-583c58ac2095; _ga=GA1.2.1262335521.1702218514; _sctr=1%7C1704654000000; mi_p_source=undefined; mi_p_medium=undefined; mi_p_campaign=undefined; mi_p_term=undefined; mi_p_content=undefined; mi_p_gclid=undefined; mi_source=google; mi_medium=organic; mi_campaign=; mi_term=; mi_content=; mi_gclid=; _ga_5L4FRV3KPW=GS1.1.1704696624.4.1.1704696641.43.0.0; _ga_C6SBBPVWWK=GS1.1.1704696624.4.1.1704696641.43.0.0; lwrid=AQGNG20qIm%2B9EVBm5pjV18ZuIyxx; xlly_s=1; hng=PK|en-PK|PKR|586; curTraffic=lazada; userLanguageML=en-PK; epssw=1*i5fB11i_F1XdGEz47zEGttFFh2EnCVz4Jr8M-qEEzrPSCVeId5MyCxHMQWfcCJX4jhAWHU1CbYwQdp62dsnEY79vOxrMjCBWXxEWhv1Gx3CI3p1bjGfWhGxGDxd5HCCULOkS1bCv-3zjowtYvfNIIO1KQQJR_Sqm116A3Dmn-p3JyLB4xDmnxJsNYMDpeo2OetzRy99-xmxJXNWokfNntcSV_9UCrdM3xDDpeDmndLHB; age_limit=18Y; t_sid=tWCKXzfL5dKHJ6P9hwes1SRmzmkVjxe3; utm_channel=NA; _m_h5_tk=66b9f7f529c772a8433e5669e94795f7_1705656645823; _m_h5_tk_enc=db2346c0c8701b95f3e9fa73a1bb8829; daraz-marketing-tracker=hide; JSESSIONID=B7914CD57669FACF1D3CEC5DD16F1DCF; tfstk=e_Gv0zGWQ94fh9SUEIpo_4pikl8kEj32ymuCslqcC0n-R2S0ChJ4C53YqlV0Gn8tX2ojoRcmlfCt2uB0Ih84CGinXPqj6V5T5ViwulY2Equ4_5tHxpAnuq52kcP9tDchLeN6xHAu8Gx505MmRKLAxzLvenxAtRGUlpNCDv9s6jUYHk6021C9SreYAqZR4glJ_-EIluC14Whn9p31xkUGhULRQOybzYLe2xK_aqCLykYkrOWaemU8xULRQOybzzEHz0WNQ8nA.; l=fBIMqdfPPaW2mUTtBOfanurza77OSIRvXuPzaNbMi9fP_-Cp5-FCW1INsGL9C3HNFswHR3kC9mckBeYBYIXN3PYinmnv_CkmnXr9aX5..; isg=BNzcaeHm1UWTzKGUhk2R8EJyrfOOVYB_YmiEt7bd6EeoAXyL3mVQD1KzYHH5wrjX'
    session = requests.Session()
    for number in range(1, total_page + 1):  # + 1 so the last page is included
        st.write(f"Fetching page {number}")
        sleep(random.random() * 5)
        url = f"https://www.daraz.pk/{category}/"
        params = {
            "ajax": "true",
            "page": str(number),
            "spm": "a2a0e.searchlistcategory.pagination.5.7d725753tvFnfz"
        }
        headers = {
            "User-Agent": USER_AGENT,
            # cookies set by earlier responses are kept by the Session object itself
            "Cookie": cookie
        }
        request = session.get(url, headers=headers, params=params, allow_redirects=True)
        if request.status_code == 200:
            # The response is occasionally not valid JSON; skip that page instead of crashing
            try:
                data = json.loads(request.text)
                if 'listItems' in data['mods']:
                    products_data.append(data)
                else:
                    failed_fetch.append(f'Data could not be fetched from page {number}')
            except (json.JSONDecodeError, KeyError):
                failed_fetch.append(f'JSON parsing failed at page {number}')
                continue
        else:
            failed_fetch.append(f'Request failed for page {number}')
    return products_data, failed_fetch


def main_scrap(main_category):
    page_nums_dict = get_page_num(main_category)  # all sub-categories and their total page counts
    extract_data = {}  # raw page payloads per sub-category
    failed_data = {}   # failure messages per sub-category
    for category, page_num in page_nums_dict.items():  # loop through sub-categories
        category_data, failed_scrap = fetch_all(page_num, category)
        extract_data[category] = category_data
        failed_data[category] = failed_scrap
    st.write('Finished scraping successfully')  # when the loop completes
    return extract_data, failed_data


def get_relevant(list_40):
    """Extract the relevant fields from one listing page (up to 40 products)."""
    items = list_40['mods']['listItems']
    data_dict = {
        'name': [i.get('name', 'None') for i in items],
        # utLogMap may be missing, so fall back to an empty dict before the nested get
        'current_price': [i.get('utLogMap', {}).get('current_price', 'None') for i in items],
        'original_price': [i.get('utLogMap', {}).get('originalPrice', 'None') for i in items],
        'x_ad': [i.get('utLogMap', {}).get('x_ad', 'None') for i in items],
        'voucherIds': [i.get('utLogMap', {}).get('voucherIds', 'None') for i in items],
        'productUrl': [i.get('productUrl', 'None') for i in items],
        'ratingScore': [i.get('ratingScore', 'None') for i in items],
        'review': [i.get('review', 'None') for i in items],
        'location': [i.get('location', 'None') for i in items],
        'sku': [i.get('sku', 'None') for i in items],
        'description': [i.get('description', 'None') for i in items],
        'brandName': [i.get('brandName', 'None') for i in items],
        'sellerId': [i.get('sellerId', 'None') for i in items],
        'sellerName': [i.get('sellerName', 'None') for i in items],
        'image': [i.get('image', 'None') for i in items],
        'itemSold': [i.get('soldInfo', {'soldNum': None}).get('soldNum', 'None') for i in items],
        'isAD': [i.get('isAD', 'None') for i in items],
        'inStock': [i.get('inStock', 'None') for i in items],
    }
    return data_dict


def category_info(category_data):
    """Combine the 40-product page dicts of a single sub-category into one dict of lists."""
    all_products = {field: [] for field in FIELDS}
    for products_40 in category_data:
        return_dic = get_relevant(products_40)
        for field in FIELDS:
            all_products[field].extend(return_dic[field])
    return all_products


def extract_clean(main_category_name):
    """Scrape every sub-category of `main_category_name` and return a cleaned DataFrame."""
    category_combine = {field: [] for field in FIELDS}
    extract_data, failed_data = main_scrap(main_category_name)  # extract_data holds all sub-category payloads
    for c in extract_data:
        dict_cat = category_info(extract_data[c])  # column name -> list of values for this sub-category
        for cat_key in dict_cat:
            category_combine[cat_key].extend(dict_cat[cat_key])
    df = pd.DataFrame(category_combine)
    df = df.astype(str)
    df['productUrl'] = df['productUrl'].str.strip('/')
    df.description = df.description.str.strip('[]').str.strip("'")
    return df, extract_data.keys()


# Streamlit app
def main():
    st.title('Products Data Scraping > Daraz.pk')
    category = st.text_input('Please input the category name')
    if st.button('Start Scraping'):
        df, categories = extract_clean(category)
        if not df.empty:
            st.dataframe(df)


if __name__ == "__main__":
    main()
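

# Usage note (a minimal sketch; "daraz_scraper.py" and the category slug below are illustrative,
# not taken from the source):
#
#   streamlit run daraz_scraper.py
#
# then enter a Daraz category slug such as "womens-bags" in the text box and press "Start Scraping".
# The slug is the path segment of the category URL, e.g. https://www.daraz.pk/womens-bags/.
# If the run succeeds, the combined DataFrame is shown in the app and could be exported afterwards
# with standard pandas calls, e.g. df.to_csv("products.csv", index=False).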