File size: 3,619 Bytes

from huggingface_hub import HfApi
import os
from pathlib import Path
from dotenv import load_dotenv
import time
from requests.exceptions import ConnectionError, HTTPError
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern

def get_gitignore_patterns():
    """读取 .gitignore 文件中的规则"""
    if os.path.exists('.gitignore'):
        with open('.gitignore', 'r', encoding='utf-8') as f:
            return f.read().splitlines()
    return []

def upload_to_huggingface(

    local_directory: str,

    repo_id: str,

    max_retries: int = 3,

    retry_delay: int = 5

):
    """

    上传整个目录到 Hugging Face

    

    Args:

        local_directory: 本地项目目录路径

        repo_id: Hugging Face 仓库ID (格式: username/repo_name)

        max_retries: 最大重试次数

        retry_delay: 重试间隔（秒）

    """
    # 加载环境变量
    load_dotenv()
    token = os.getenv("HUGGINGFACE_TOKEN")
    
    if not token:
        raise ValueError("请在 .env 文件中设置 HUGGINGFACE_TOKEN")

    # 读取 .gitignore 规则
    gitignore_patterns = get_gitignore_patterns()
    # 添加一些额外的忽略规则
    additional_patterns = [
        "VectorScience - 方案介绍 - 策略部分.docx",
        ".git/",
        "__pycache/",
        "*.pyc",
        ".env",
        ".gitignore",
        ".DS_Store",
        "ewv9ssdcuvg6",
        ".docx",
        "wandb/",
        "jk_zfls/"
    ]
    gitignore_patterns.extend(additional_patterns)
    
    # 创建 PathSpec 对象来匹配文件
    spec = PathSpec.from_lines(GitWildMatchPattern, gitignore_patterns)

    # 初始化 Hugging Face API
    api = HfApi()

    # 获取所有要上传的文件
    files_to_upload = []
    for root, _, files in os.walk(local_directory):
        for file in files:
            file_path = Path(root) / file
            relative_path = file_path.relative_to(local_directory)
            
            # 使用 PathSpec 检查文件是否应该被忽略
            if not spec.match_file(str(relative_path)):
                files_to_upload.append((str(file_path), str(relative_path)))

    # 上传文件
    for local_path, path_in_repo in files_to_upload:
        for attempt in range(max_retries):
            try:
                print(f"正在上传: {local_path} -> {path_in_repo}")
                api.upload_file(
                    path_or_fileobj=local_path,
                    path_in_repo=path_in_repo,
                    repo_id=repo_id,
                    token=token
                )
                print(f"成功上传: {path_in_repo}")
                break
            except (ConnectionError, HTTPError) as e:
                if attempt < max_retries - 1:
                    print(f"上传失败，{retry_delay}秒后重试... ({attempt + 1}/{max_retries})")
                    print(f"错误信息: {str(e)}")
                    time.sleep(retry_delay)
                else:
                    print(f"上传失败: {path_in_repo}")
                    print(f"错误信息: {str(e)}")
                    raise

if __name__ == "__main__":
    # 设置环境变量以使用代理
    os.environ["HTTPS_PROXY"] = "http://127.0.0.1:17890"
    
    # 使用示例
    upload_to_huggingface(
        local_directory=".",  # 当前目录
        repo_id="Facepalm0/Ubiquant_CharacterHunter",  # 替换为你的仓库ID
        max_retries=3,  # 最大重试5次
        retry_delay=5  # 每次重试间隔5秒
    )