In [None]:
!pip install -q llama-index==0.12.12 openai==1.59.6 tiktoken==0.8.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/454.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m450.6/454.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.8/454.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/41.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.2/304.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN2'
from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN2')

In [None]:
# Initialise the Hugging Face Hub client, authenticated with HF_TOKEN

from huggingface_hub import HfApi

api = HfApi(token=HF_TOKEN)

In [None]:
# Download the crawler archive from the Hugging Face dataset repo
# Archive to fetch; the same name is reused below to locate the file for unzipping
file_name = 'Crawler.zip'
# Stores the file under ./data; the call's return value (the local path) is shown as the cell output
api.hf_hub_download(
    filename=file_name,
    local_dir="./data",
    repo_id="vicpada/AzureResources",
    repo_type="dataset"
)


Crawler.zip:   0%|          | 0.00/440M [00:00<?, ?B/s]

'data/Crawler.zip'

In [None]:
# Unzip the downloaded archive into ./data/extracted

import zipfile
import os

# Path of the downloaded zip file (./data/<file_name>) and the folder to extract into
zip_file_path = os.path.join("./data", file_name)
extract_dir = "./data/extracted"

# Create the extraction directory if it doesn't exist (exist_ok avoids an error on re-runs)
os.makedirs(extract_dir, exist_ok=True)

# Extract every member of the archive; the context manager closes the zip afterwards
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"File '{file_name}' unzipped to '{extract_dir}'")


File 'Crawler.zip' unzipped to './data/extracted'


In [None]:
from typing import List
# For each folder under the extraction directory, create a JSONL file with one JSON record per line; each record describes one file inside that folder

import json
import re
import tiktoken
import uuid

# Input directory holding the extracted folders, and the directory that receives the JSONL files
extract_dir = "./data/extracted"
output_dir = "./data/jsonl_output"

# Create the output directory if it doesn't exist (exist_ok avoids an error on re-runs)
os.makedirs(output_dir, exist_ok=True)

def extract_title(content: str):
    """Return a best-effort title for a markdown document.

    The text of the first ``# `` heading line wins. When there is no such
    heading, the first line that is not blank is returned instead. Both are
    returned with surrounding whitespace stripped.

    Args:
        content: Full text of the document.

    Returns:
        The title string, or ``None`` if the content has no non-blank line.
    """
    heading = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    if heading is not None:
        return heading.group(1).strip()

    # No heading: fall back to the first line that still has text after stripping.
    stripped_lines = map(str.strip, content.split("\n"))
    return next((text for text in stripped_lines if text), None)

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``
            (callers in this notebook use ``"cl100k_base"``).

    Returns:
        Number of tokens in the encoded text.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    # disallowed_special=() turns off the special-token check on the input text.
    token_ids = tokenizer.encode(string, disallowed_special=())
    return len(token_ids)

def remove_copyright_header(content: str) -> str:
    """Drop the first ``<!--Copyright ... -->`` HTML comment from the text.

    Only the first such comment is removed, together with the whitespace
    that follows it; the comment may span several lines. The result is
    stripped of leading and trailing whitespace.

    Args:
        content: Document text.

    Returns:
        The text without the first copyright comment, stripped.
    """
    without_header = re.sub(
        r"<!--Copyright.*?-->\s*", "", content, count=1, flags=re.DOTALL
    )
    return without_header.strip()

def remove_url_and_title_header(content: str) -> str:
    """Strip a leading ``---`` ... ``---`` header block from the text.

    The block is removed only when the text starts with ``---`` and a
    ``url:`` or ``title:`` key occurs somewhere after that opening marker.
    The lookahead scans the rest of the text, not just the header block.
    Removal runs up to and including the next ``---`` plus the whitespace
    after it. The result is stripped of leading and trailing whitespace.

    Args:
        content: Document text.

    Returns:
        The text without the leading header block, stripped.
    """
    front_matter = r"(?s)^---\s*(?=.*\b(url|title):).*?---\s*\n*"
    body = re.sub(front_matter, "", content, count=1, flags=re.DOTALL)
    return body.strip()



def process_files(folder_name:str, folder_path:str, files:List) -> List[dict[str, str]]:
    """Turn the files of one folder into JSON-serialisable records.

    Each file is read as UTF-8 and its token count is measured on the raw
    text. Files with fewer than 100 or more than 200,000 tokens are reported
    and skipped. For the rest, the URL and title are read from the
    ``url: "..."`` / ``title: "..."`` lines, then the copyright comment and
    the leading ``---`` header block are both stripped from the content.

    Args:
        folder_name: Name of the folder; stored as the record's ``source``.
        folder_path: Path of the folder that contains ``files``.
        files: File names (relative to ``folder_path``) to process.

    Returns:
        One dict per kept file with the keys ``tokens`` (int), ``doc_id``
        (UUIDv5 string derived from the cleaned content), ``name``, ``url``
        (``None`` when no URL line is found), ``retrieve_doc`` (bool, True
        when the file has at most 8000 tokens), ``source`` and ``content``.
    """
    jsonl_data = []
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as infile:
            content = infile.read()

        # Token count is taken on the raw text, before any header is removed
        token_count = num_tokens_from_string(content, "cl100k_base")

        # Skip very small or extremely large files before doing any further work
        if token_count < 100 or token_count > 200_000:
            print(
                f"Skipping {file_path} due to token count {token_count}"
            )
            continue

        # Extract URL and title, handling potential None results from re.search
        url_match = re.search(r'^url:\s*"([^"]+)"', content, re.MULTILINE)
        extracted_url = url_match.group(1) if url_match else None

        title_match = re.search(r'^title:\s*"([^"]+)"', content, re.MULTILINE)
        # Use extract_title as fallback when there is no quoted title line
        extracted_title = title_match.group(1) if title_match else extract_title(content)

        # Chain the two clean-up steps: the second must receive the output of the
        # first, otherwise the copyright header removal is thrown away.
        cleaned_content = remove_copyright_header(content)
        cleaned_content = remove_url_and_title_header(cleaned_content)

        json_object = {
            "tokens": token_count,
            "doc_id": str(uuid.uuid5(uuid.NAMESPACE_DNS, cleaned_content)),
            "name": (extracted_title if extracted_title else file_name),
            "url": extracted_url,
            "retrieve_doc": (token_count <= 8000),
            "source": folder_name,
            "content": cleaned_content,
        }
        jsonl_data.append(json_object)
    return jsonl_data



# Write one JSONL file per folder found directly under the extraction directory
for folder_name in os.listdir(extract_dir):
    folder_path = os.path.join(extract_dir, folder_name)

    # Entries that are not directories are ignored
    if not os.path.isdir(folder_path):
        continue

    jsonl_filename = f"{folder_name}.jsonl"
    jsonl_filepath = os.path.join(output_dir, jsonl_filename)

    with open(jsonl_filepath, 'w') as outfile:
        # Build the records for every file in this folder, then emit one JSON object per line
        json_data = process_files(folder_name, folder_path, os.listdir(folder_path))
        for json_object in json_data:
            json_str = json.dumps(json_object)
            outfile.write(json_str + '\n')

    print(f"Created JSONL file: {jsonl_filepath}")

Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___myignite.microsoft.com_videos_2658_%22.md due to token count 60
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22_t5_s_gxcuf89792_rss_Community_%22.md due to token count 61
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_why-microsoft-store_icid=footer_why-msft-store_7102020_%22.md due to token count 79
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___portal.office.com_landing_%22.md due to token count 57
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___www.microsoft.com_en-us_store_b_education_%22.md due to token count 61
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22https___sharegate.com_microsoft-migration_%22.md due to token count 59
Skipping ./data/extracted/tech-community/techcommunity.microsoft.com_%22http___bit.ly_SVS17CHI_%22.

In [None]:
# Upload the generated JSONL files to the Hugging Face dataset repo

# List files in the output directory
output_files = [f for f in os.listdir(output_dir) if f.endswith('.jsonl')]

# Upload each JSONL file to Hugging Face Datasets.
# The loop uses its own names (jsonl_name / jsonl_path) so that the notebook-level
# `file_name` ('Crawler.zip') used by the download and unzip cells is not overwritten.
for jsonl_name in output_files:
    jsonl_path = os.path.join(output_dir, jsonl_name)
    try:
        api.upload_file(
            path_or_fileobj=jsonl_path,
            path_in_repo=jsonl_name,
            repo_id="vicpada/AzureResources", # Replace with your repo ID
            repo_type="dataset",
            commit_message=f"Add {jsonl_name}"
        )
        print(f"Successfully uploaded {jsonl_name}")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files
        print(f"Error uploading {jsonl_name}: {e}")

microsoft-learn.jsonl:   0%|          | 0.00/270M [00:00<?, ?B/s]

Successfully uploaded microsoft-learn.jsonl


tech-community.jsonl:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

Successfully uploaded tech-community.jsonl


azure-updates.jsonl:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Successfully uploaded azure-updates.jsonl


github-samples.jsonl:   0%|          | 0.00/172M [00:00<?, ?B/s]

Successfully uploaded github-samples.jsonl


azure-architecture.jsonl:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

Successfully uploaded azure-architecture.jsonl
