Spaces:
Running
Running
import requests | |
import os | |
from typing import Iterable | |
# GitHub organization whose repositories are scanned.
org_name = "oceanhackweek"

# GitHub REST API endpoint that lists the organization's repositories.
# Built from `org_name` so the two can never drift apart.
url = f"https://api.github.com/orgs/{org_name}/repos"

# Optional personal access token, read from the `git_token` environment variable.
access_token = os.getenv('git_token')

headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only authenticate when a token is actually configured; otherwise the
# literal string "token None" would be sent as the credential.
if access_token:
    headers["Authorization"] = f"token {access_token}"

# Directory that stores the downloaded README files.
os.makedirs('readmes_proj', exist_ok=True)

# Maps each saved README filename to the HTML URL of its repository.
repo_links = {}
def download_readme(repo_name, repo_html_url, branches=("main", "master"), timeout=30):
    """Download a repository's README.md into the ``readmes_proj`` directory.

    Each branch in ``branches`` is tried in order until one serves a
    ``README.md``. On success the file is written as
    ``readmes_proj/<repo_name>_README.md`` and ``repo_links`` gains an entry
    mapping that filename to ``repo_html_url``. On failure a message is
    printed and nothing is recorded.

    Args:
        repo_name: Name of the repository inside ``org_name``.
        repo_html_url: Browser URL of the repository, stored in ``repo_links``.
        branches: Branch names to try, in order.
        timeout: Seconds to wait for each HTTP request.
    """
    file_name = f"{repo_name}_README.md"
    file_path = os.path.join('readmes_proj', file_name)

    last_error = None
    for branch in branches:
        # Construct the URL for the README file on this branch.
        readme_url = f"https://raw.githubusercontent.com/{org_name}/{repo_name}/{branch}/README.md"
        try:
            response = requests.get(readme_url, timeout=timeout)
            response.raise_for_status()  # Raise an error for bad responses
        except requests.exceptions.HTTPError as e:
            # Most likely the README does not exist on this branch; try the next one.
            last_error = e
            continue
        except requests.exceptions.RequestException as e:
            # Network-level failure: report it instead of aborting the whole run.
            print(f"Failed to download {repo_name}: {e}")
            return

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        # Record the repo link only after the file was written successfully.
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
        return

    print(f"Failed to download {repo_name}: {last_error}")
def get_repositories(url, timeout=30):
    """Return every repository object served by a paginated GitHub API listing.

    Starting at ``url``, each page is requested with the module-level
    ``headers`` and its JSON payload is appended to the result. The ``next``
    link of the response is followed until no further page is advertised.

    Args:
        url: URL of the first page; a falsy value yields an empty list.
        timeout: Seconds to wait for each HTTP request, so a stalled
            connection cannot hang the script indefinitely.

    Returns:
        A list with the decoded JSON items of all pages.

    Raises:
        requests.exceptions.HTTPError: If any page returns an error status.
    """
    repos = []
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        repos.extend(response.json())
        # `response.links` holds the parsed Link header; no 'next' entry means last page.
        url = response.links.get('next', {}).get('url')
    return repos
# Fetch every repository of the organization, then download the README of
# each repository whose name contains "proj".
repos = get_repositories(url)
for repo in repos:
    repo_name = repo["name"]
    repo_html_url = repo["html_url"]
    if "proj" not in repo_name:
        continue
    download_readme(repo_name, repo_html_url)
def load_md_to_langchain_document(readme_dict, filename):
    """Load a saved README from ``readmes_proj`` and wrap it in a ``Document``.

    ``Document`` must be provided by the surrounding module; it is not
    imported in the visible part of this file.

    Args:
        readme_dict: Mapping of README filename to repository URL.
        filename: Name of the README file inside ``readmes_proj``.

    Returns:
        A ``Document`` whose ``page_content`` is the processed markdown and
        whose ``metadata["source"]`` is ``readme_dict[filename]``.

    Raises:
        KeyError: If ``filename`` is not a key of ``readme_dict``.
        OSError: If the README file cannot be opened.
    """
    # Imported locally: the module's import block does not bring `re` into scope.
    import re

    # Load the markdown content from the file.
    readme_path = os.path.join('readmes_proj', filename)
    with open(readme_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    # Insert a space after each match of the link-like pattern.
    # NOTE(review): the pattern ends in an escaped backslash, so it only
    # matches text shaped like "[label](target\". If the intent was to add a
    # space after complete links "[label](target)", the tail should be `\)`
    # -- confirm before changing, since that alters every document's content.
    corrected_content = re.sub(r'(\[.*?\]\(.*?\\)', r'\1 ', markdown_content)

    # Create a LangChain Document
    langchain_document = Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]},
    )
    return langchain_document
# Turn every successfully downloaded README into a LangChain Document.
documents = [
    load_md_to_langchain_document(repo_links, filename)
    for filename in repo_links
]
def save_docs_to_jsonl(array: Iterable["Document"], file_path: str) -> None:
    """Write documents to ``file_path`` in JSON Lines format.

    Each item's ``json()`` result is written on its own line. Any existing
    file at ``file_path`` is overwritten; an empty iterable produces an
    empty file.

    Args:
        array: Documents to serialize; each must provide a ``json()`` method
            returning a string.
        file_path: Destination path of the output file.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding and avoids encode errors on non-ASCII README content.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')
# Persist all collected README documents, one JSON object per line.
save_docs_to_jsonl(documents, file_path='project_readmes.json')