批量下载Github与Bitbucket项目

原理是使用Github/Bitbucket提供的API获取所有的项目列表,然后使用系统的git命令克隆项目源码。

Github

使用本脚本需要配置 GitHub Access Token。

# Shallow clone all repos from GitHub.
# Usage:
# python3 github.py -t [GITHUB_TOKEN] -o [OUTPUT_DIR]

from requests.auth import HTTPBasicAuth
import requests
import os


# Accumulates one (name, ssh_url) tuple per repository found by load_repos;
# the script entry point iterates it to clone each repository.
repos = set()

def load_repos(token, page=1):
    """Fetch the token owner's repositories into the global `repos` set.

    Starting at `page`, requests successive pages of the GitHub
    `/user/repos` listing and adds a `(name, ssh_url)` tuple to `repos`
    for every entry. Paging stops at the first page that does not come
    back completely full.

    Args:
        token: GitHub access token sent in the `Authorization` header.
        page: Page number to start from (defaults to the first page).

    Raises:
        requests.HTTPError: If GitHub answers with an error status, for
            example when the token is invalid.
    """
    per_page = 100
    headers = {'Authorization': f'token {token}'}

    # Iterate rather than recurse so the number of pages is not bounded by
    # the interpreter's recursion limit.
    while True:
        url = f'https://api.github.com/user/repos?page={page}&per_page={per_page}'
        response = requests.get(url, headers=headers)
        # An error reply is a JSON object, not a list of repositories; fail
        # here with the HTTP status instead of with a TypeError further down.
        response.raise_for_status()
        rs = response.json()

        for r in rs:
            repos.add((r['name'], r['ssh_url']))

        # A page that is not completely full is the last one.
        if len(rs) != per_page:
            break
        page += 1


def load_all_repos(token):
    """Collect every repository visible to `token` into the global `repos`.

    Delegates to `load_repos`, starting from the first result page.
    """
    load_repos(token, page=1)

def download_repo(output, name, href):
    """Shallow-clone one repository into `<output>/<name>` using `git`.

    The exit status of `git` is not checked, so a failed clone does not
    stop the caller from moving on to the next repository.

    Args:
        output: Directory under which the clone is created.
        name: Repository name, used as the clone's directory name.
        href: Clone URL handed to `git clone`.

    Raises:
        FileNotFoundError: If the `git` executable cannot be found.
    """
    # Imported locally so this function does not rely on the file's import
    # header being changed.
    import subprocess

    print(f'Cloning {name} {href}')
    # Pass an argument list instead of a shell string: `name` and `href` come
    # from an API response and must never be interpreted by a shell. The `--`
    # keeps a value starting with `-` from being parsed as a git option.
    subprocess.run(['git', 'clone', '--depth=1', '--', href, f'{output}/{name}'])


if __name__ == "__main__":
    # Script entry point: parse the command line, collect the repository
    # list, then clone each repository into the output directory.
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("-t", "--token", type=str, required=True,
                    help="Github developer token")
    ap.add_argument("-o", "--output", type=str, required=True,
                    help="Output directory")
    args = ap.parse_args()

    token, output = args.token, args.output

    # Fills the module-level `repos` set.
    load_all_repos(token)

    for name, href in repos:
        download_repo(output, name, href)

Bitbucket

使用本脚本需要配置 Bitbucket App Password才能访问。

# Shallow clone all repos from bitbucket.
# Usage:
# python3 bitbucket.py -u [USERNAME] -p [BITBUCKET_APP_PASSWORD] -o [OUTPUT_DIR]
# Requirements:
# 1. Setup app password in Bitbucket
# 2. Setup local ssh keys
# 3. Only git repositories are supported

from requests.auth import HTTPBasicAuth
import requests
import os

# Roles queried one after another; each is sent as the `role` query parameter.
roles = ['owner', 'member', 'contributor', 'admin']
# Intended number of repositories per listing page.
pagelen = 100
# Accumulates one (name, ssh clone URL) tuple per repository found.
repos = set()

def load_repos(url, user, pwd):
    """Load every page of a Bitbucket repository listing into `repos`.

    Starting from `url`, follows the `next` link of each response until no
    further page is advertised. For each repository that exposes an ssh
    clone link, a `(name, href)` tuple is added to the global `repos` set;
    repositories without one are reported and skipped.

    Args:
        url: URL of the first listing page to request.
        user: Bitbucket username for HTTP basic authentication.
        pwd: Bitbucket app password for HTTP basic authentication.

    Raises:
        requests.HTTPError: If Bitbucket answers with an error status, for
            example when the credentials are rejected.
    """
    auth = HTTPBasicAuth(user, pwd)

    # Iterate rather than recurse so the number of pages is not bounded by
    # the interpreter's recursion limit.
    while url:
        response = requests.get(url, auth=auth)
        # Fail with the HTTP status instead of a KeyError on 'values' when
        # the reply is an error document.
        response.raise_for_status()
        rs = response.json()

        for r in rs['values']:
            name = r['name']
            links = r['links']['clone']
            # Default to None: without it a repository lacking an ssh link
            # would abort the whole run with a bare StopIteration.
            href = next((link['href'] for link in links if link['name'] == 'ssh'), None)
            if href is None:
                print(f'Skipping repo {name}: no ssh clone link')
                continue
            print(f'Found repo {name} {href}')
            repos.add((name, href))

        # The last page carries no 'next' key, which ends the loop.
        url = rs.get('next')


def load_all_repos(user, pwd):
    """Collect the repositories for every role in `roles` into `repos`.

    Builds one listing URL per role and hands it to `load_repos`, which
    follows the pagination and fills the global `repos` set.

    Args:
        user: Bitbucket username for HTTP basic authentication.
        pwd: Bitbucket app password for HTTP basic authentication.
    """
    for role in roles:
        # Use the module-level `pagelen` rather than repeating its value
        # here, so the page size is configured in exactly one place.
        url = f'https://api.bitbucket.org/2.0/repositories?pagelen={pagelen}&role={role}'
        load_repos(url, user, pwd)

def download_repo(output, name, href):
    """Shallow-clone one repository into `<output>/<name>` using `git`.

    The exit status of `git` is not checked, so a failed clone does not
    stop the caller from moving on to the next repository.

    Args:
        output: Directory under which the clone is created.
        name: Repository name, used as the clone's directory name.
        href: Clone URL handed to `git clone`.

    Raises:
        FileNotFoundError: If the `git` executable cannot be found.
    """
    # Imported locally so this function does not rely on the file's import
    # header being changed.
    import subprocess

    print(f'Cloning {name} {href}')
    # Pass an argument list instead of a shell string: `name` and `href` come
    # from an API response and must never be interpreted by a shell. The `--`
    # keeps a value starting with `-` from being parsed as a git option.
    subprocess.run(['git', 'clone', '--depth=1', '--', href, f'{output}/{name}'])


if __name__ == "__main__":
    # Script entry point: parse the command line, collect the repository
    # list, then clone each repository into the output directory.
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("-u", "--user", type=str, required=True,
                    help="Bitbucket username")
    ap.add_argument("-p", "--password", type=str, required=True,
                    help="Bitbucket app password")
    ap.add_argument("-o", "--output", type=str, required=True,
                    help="Output directory")
    args = ap.parse_args()

    user, pwd, output = args.user, args.password, args.output

    # Fills the module-level `repos` set.
    load_all_repos(user, pwd)

    for name, href in repos:
        download_repo(output, name, href)