from loguru import logger

from magic_pdf.libs.drop_reason import DropReason


def get_data_source(jso: dict):
    """Return the record's data source.

    Reads ``jso["data_source"]``; when that key is missing or ``None``,
    falls back to ``jso["file_source"]`` (which may itself be ``None``).
    """
    source = jso.get("data_source")
    return jso.get("file_source") if source is None else source


def get_data_type(jso: dict):
    """Return the record's data type.

    Reads ``jso["data_type"]``; when that key is missing or ``None``,
    falls back to ``jso["file_type"]`` (which may itself be ``None``).
    """
    kind = jso.get("data_type")
    return jso.get("file_type") if kind is None else kind


def get_bookid(jso: dict):
    """Return the record's book id.

    Reads ``jso["bookid"]``; when that key is missing or ``None``,
    falls back to ``jso["original_file_id"]`` (which may itself be ``None``).
    """
    identifier = jso.get("bookid")
    return jso.get("original_file_id") if identifier is None else identifier


def exception_handler(jso: dict, e):
    """Log ``e`` and flag ``jso`` as a record to be dropped.

    The exception is logged through ``logger.exception``; afterwards three
    keys are written into ``jso`` in place: ``_need_drop`` (``True``),
    ``_drop_reason`` (``DropReason.Exception``) and ``_exception`` (the
    exception rendered as ``"ERROR: <e>"``).

    Returns:
        The same ``jso`` object that was passed in, after mutation.
    """
    logger.exception(e)

    # Keep the assignment order: it determines key order for new entries.
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso


def get_bookname(jso: dict):
    """Build the book name as ``"<data source>/<file_id>"``.

    The data source comes from ``get_data_source``; ``file_id`` is read with
    ``jso.get``, so a missing value is rendered as the text ``None``.
    """
    return f"{get_data_source(jso)}/{jso.get('file_id')}"


def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the JSON record and return it as a dict.

    Returns:
        A dict with ``_pdf_type`` copied from ``jso["_pdf_type"]`` and
        ``model_list`` taken from ``jso["doc_layout_result"]``.

    Raises:
        KeyError: If either source key is absent from ``jso``.
    """
    pdf_type = jso["_pdf_type"]
    layout_result = jso["doc_layout_result"]
    return {
        "_pdf_type": pdf_type,
        "model_list": layout_result,
    }