# chatbot_ohw_projects / scrape_github.py
# Author: boryasbora (commit ee03791)
import os
import re
from typing import Iterable

import requests
# GitHub API endpoint for the organization's repositories
org_name = "oceanhackweek"
url = f"https://api.github.com/orgs/{org_name}/repos"

# Optional personal access token (raises the unauthenticated rate limit).
access_token = os.getenv('git_token')
headers = {"Accept": "application/vnd.github.v3+json"}
if access_token:
    # Only send Authorization when a token is actually configured;
    # sending the literal string "token None" makes GitHub reject the request.
    headers["Authorization"] = f"token {access_token}"

# Directory where the downloaded README files are stored.
os.makedirs('readmes_proj', exist_ok=True)

# Maps downloaded README filename -> repository HTML URL.
repo_links = {}
def download_readme(repo_name, repo_html_url):
    """Download a repository's README.md into readmes_proj/ and record its URL.

    Tries the ``main`` branch first and falls back to ``master`` so older
    repositories that predate the default-branch rename are still covered.

    Parameters
    ----------
    repo_name : str
        Name of the repository within the organization.
    repo_html_url : str
        Web URL of the repository, recorded in ``repo_links`` on success.
    """
    last_error = None
    for branch in ("main", "master"):
        readme_url = (
            f"https://raw.githubusercontent.com/{org_name}/{repo_name}"
            f"/{branch}/README.md"
        )
        try:
            # timeout prevents a hung connection from stalling the whole run
            response = requests.get(readme_url, timeout=30)
            response.raise_for_status()  # Raise an error for bad responses
        except requests.exceptions.RequestException as e:
            # RequestException also covers connection/timeout errors, which
            # the original HTTPError-only handler would have let crash the
            # surrounding loop.
            last_error = e
            continue  # try the next candidate branch
        file_name = f"{repo_name}_README.md"
        file_path = os.path.join('readmes_proj', file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        # Save the repo link in the dictionary
        repo_links[file_name] = repo_html_url
        print(f"Downloaded: {repo_name}")
        return
    print(f"Failed to download {repo_name}: {last_error}")
def get_repositories(url):
    """Return every repository of the organization, following pagination.

    Parameters
    ----------
    url : str
        GitHub API URL listing the organization's repositories.

    Returns
    -------
    list[dict]
        One JSON object per repository, accumulated across all pages.

    Raises
    ------
    requests.exceptions.HTTPError
        If any page request returns an error status.
    """
    repos = []
    while url:
        # timeout guards against a hung connection stalling the script
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        repos.extend(response.json())
        # GitHub exposes pagination through the Link header, which requests
        # parses into response.links; no 'next' entry means the last page.
        url = response.links.get('next', {}).get('url')
    return repos
# Fetch every repository in the organization and download the README of
# each one whose name marks it as a project repo.
repos = get_repositories(url)
for repository in repos:
    name = repository["name"]
    link = repository["html_url"]
    if "proj" not in name:
        continue
    download_readme(name, link)
def load_md_to_langchain_document(readme_dict, filename):
    """Load a downloaded README file and wrap it in a LangChain Document.

    Parameters
    ----------
    readme_dict : dict
        Maps README filename -> repository HTML URL (see ``repo_links``).
    filename : str
        Name of the README file inside the ``readmes_proj`` directory.

    Returns
    -------
    Document
        The link-corrected markdown content, with the repository URL stored
        under the ``source`` metadata key.
    """
    # Bug fix: the path previously contained a literal placeholder instead of
    # the requested filename, so every call opened a nonexistent file.
    file_path = os.path.join('readmes_proj', filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()
    # Insert a space after each complete markdown link [text](url) so adjacent
    # text is not glued to it. NOTE(review): the original pattern looked
    # garbled (r'(\[.*?\]\(.*?\\)' — the link's closing paren was never
    # matched); confirm this is the intended correction.
    corrected_content = re.sub(r'(\[.*?\]\(.*?\))', r'\1 ', markdown_content)
    # NOTE(review): `Document` must be imported at module level, e.g.
    # `from langchain_core.documents import Document` — it is missing here.
    langchain_document = Document(
        page_content=corrected_content,
        metadata={"source": readme_dict[filename]},
    )
    return langchain_document
# Wrap every downloaded README in a LangChain Document.
documents = [
    load_md_to_langchain_document(repo_links, filename)
    for filename in repo_links
]
def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """Serialize LangChain documents to a JSON-Lines file.

    Parameters
    ----------
    array : Iterable[Document]
        Documents to persist; each is written as one JSON object per line.
    file_path : str
        Destination path of the JSON-Lines file.
    """
    # utf-8 keeps non-ASCII README content writable regardless of the
    # platform's default locale encoding (the content was read as utf-8).
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')
# Persist every project README as one JSON record per line.
save_docs_to_jsonl(documents, 'project_readmes.json')