pooyanrg committed on
Commit ad4721b
1 Parent(s): 0884186

initial commit
Dockerfile ADDED
@@ -0,0 +1,13 @@
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
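For quick testing outside Docker, the container's CMD can be reproduced locally. A minimal sketch, assuming the pinned requirements are installed and port 7860 is free:

# Local equivalent of the image's CMD (illustrative).
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)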
app.py ADDED
@@ -0,0 +1,324 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ from PIL import Image, ImageDraw
4
+ import torch
5
+ from torchvision.transforms import Compose, Resize, ToTensor, Normalize
6
+ from utils.model import init_model
7
+ from utils.tokenization_clip import SimpleTokenizer as ClipTokenizer
8
+
9
+ from fastapi.staticfiles import StaticFiles
10
+ from fileservice import app
11
+
12
+
13
+ html_text = """
14
+ <div id="container">
15
+ <canvas id="canvas" width="512" height="512"></canvas><img id="canvas-background" style="display:none;"/>
16
+ </div>
17
+ """
18
+
19
+ def image_to_tensor(image_path):
20
+ image = Image.open(image_path).convert('RGB')
21
+
22
+ preprocess = Compose([
23
+ Resize([224, 224], interpolation=Image.BICUBIC),
24
+ lambda image: image.convert("RGB"),
25
+ ToTensor(),
26
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
27
+ ])
28
+ image_data = preprocess(image)
29
+
30
+ return {'image': image_data}
31
+
32
+ def get_image_data(image_path):
33
+ image_input = image_to_tensor(image_path)
34
+ return image_input
35
+
36
+ def get_intervention_vector(selected_cells_bef, selected_cells_aft):
37
+ left = np.reshape(np.zeros((1, 14 * 14)), (14, 14))
38
+ right = np.reshape(np.zeros((1, 14 * 14)), (14, 14))
39
+
40
+ for (i, j) in selected_cells_bef:
41
+ left[i, j] = 1.
42
+ for (i, j) in selected_cells_aft:
43
+ right[i, j] = 1.
44
+
45
+
46
+ left_map = np.zeros((1, 14 * 14 + 1))
47
+ right_map = np.zeros((1, 14 * 14 + 1))
48
+
49
+ left_map[0, 1:] = np.reshape(left, (1, 14 * 14))
50
+ right_map[0, 1:] = np.reshape(right, (1, 14 * 14))
51
+
52
+
53
+ if len(selected_cells_bef) == 0:
54
+ left_map[0, 0] = 0.0
55
+
56
+ if len(selected_cells_aft) == 0:
57
+ right_map[0, 0] = 0.0
58
+
59
+
60
+ return left_map, right_map
61
+
62
+ def _get_rawimage(image_path):
63
+ # Pair x L x T x 3 x H x W
64
+ image = np.zeros((1, 3, 224,
65
+ 224), dtype=np.float64)  # np.float was removed from NumPy; use an explicit dtype
66
+
67
+ for i in range(1):
68
+
69
+ raw_image_data = get_image_data(image_path)
70
+ raw_image_data = raw_image_data['image']
71
+
72
+ image[i] = raw_image_data
73
+
74
+ return image
75
+
76
+
77
+ def greedy_decode(model, tokenizer, video, video_mask, gt_left_map, gt_right_map):
78
+ visual_output, left_map, right_map = model.get_sequence_visual_output(video, video_mask,
79
+ gt_left_map[:, 0, :].squeeze(), gt_right_map[:, 0, :].squeeze())
80
+
81
+ video_mask = torch.ones(visual_output.shape[0], visual_output.shape[1], device=visual_output.device).long()
82
+ input_caption_ids = torch.zeros(visual_output.shape[0], device=visual_output.device).data.fill_(tokenizer.vocab["<|startoftext|>"])
83
+ input_caption_ids = input_caption_ids.long().unsqueeze(1)
84
+ decoder_mask = torch.ones_like(input_caption_ids)
85
+ for i in range(32):
86
+ decoder_scores = model.decoder_caption(visual_output, video_mask, input_caption_ids, decoder_mask, get_logits=True)
87
+ next_words = decoder_scores[:, -1].max(1)[1].unsqueeze(1)
88
+ input_caption_ids = torch.cat([input_caption_ids, next_words], 1)
89
+ next_mask = torch.ones_like(next_words)
90
+ decoder_mask = torch.cat([decoder_mask, next_mask], 1)
91
+
92
+ return input_caption_ids[:, 1:].tolist(), left_map, right_map
93
+
94
+ # Run the model on the image pair, using the selected patches as interventions
95
+ def predict_image(image_bef, image_aft, selected_cells_bef, selected_cells_aft):
96
+ if image_bef is None:
97
+ return "No image provided", "", ""
98
+ if image_aft is None:
99
+ return "No image provided", "", ""
100
+
101
+
102
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103
+
104
+ model = init_model('data/pytorch_model.pt', device)
105
+
106
+ tokenizer = ClipTokenizer()
107
+
108
+ left_map, right_map = get_intervention_vector(selected_cells_bef, selected_cells_aft)
109
+
110
+ left_map, right_map = torch.from_numpy(left_map).unsqueeze(0), torch.from_numpy(right_map).unsqueeze(0)
111
+
112
+ bef_image = torch.from_numpy(_get_rawimage(image_bef)).unsqueeze(1)
113
+ aft_image = torch.from_numpy(_get_rawimage(image_aft)).unsqueeze(1)
114
+
115
+ image_pair = torch.cat([bef_image, aft_image], 1)
116
+
117
+ image_mask = torch.from_numpy(np.ones(2, dtype=np.int64)).unsqueeze(0)  # np.long was removed from NumPy
118
+
119
+ result_list, left_map, right_map = greedy_decode(model, tokenizer, image_pair, image_mask, left_map, right_map)
120
+
121
+
122
+ decode_text_list = tokenizer.convert_ids_to_tokens(result_list[0])
123
+ if "<|endoftext|>" in decode_text_list:
124
+ SEP_index = decode_text_list.index("<|endoftext|>")
125
+ decode_text_list = decode_text_list[:SEP_index]
126
+ if "!" in decode_text_list:
127
+ PAD_index = decode_text_list.index("!")
128
+ decode_text_list = decode_text_list[:PAD_index]
129
+ decode_text = " ".join(decode_text_list).strip()  # join the token list; a list has no .strip()
130
+
131
+ # Final predicted caption
132
+ pred = decode_text
133
+
134
+ # Include information about selected cells
135
+ selected_info_bef = f"{selected_cells_bef}" if selected_cells_bef else "No image patch was selected"
136
+ selected_info_aft = f"{selected_cells_aft}" if selected_cells_aft else "No image patch was selected"
137
+
138
+ return pred, selected_info_bef, selected_info_aft
139
+
140
+ # Add grid to the image
141
+ def add_grid_to_image(image_path, grid_size=14):
142
+ if image_path is None:
143
+ return None
144
+
145
+ image = Image.open(image_path)
146
+ w, h = image.size
147
+
148
+ image = image.convert('RGBA')
149
+
150
+ draw = ImageDraw.Draw(image)
151
+
152
+ x_positions = np.linspace(0, w, grid_size + 1)
153
+ y_positions = np.linspace(0, h, grid_size + 1)
154
+
155
+ # Draw the vertical lines
156
+ for x in x_positions[1:-1]:
157
+ line = ((x, 0), (x, h))
158
+ draw.line(line, fill='white')
159
+
160
+ # Draw the horizontal lines
161
+ for y in y_positions[1:-1]:
162
+ line = ((0, y), (w, y))
163
+ draw.line(line, fill='white')
164
+
165
+ return image, h, w
166
+
167
+ # Handle cell selection
168
+ def handle_click(image, evt: gr.SelectData, selected_cells, image_path):
169
+ if image is None:
170
+ return None, []
171
+
172
+ grid_size = 14
173
+
174
+ image, h, w = add_grid_to_image(image_path, grid_size)
175
+
176
+ x_positions = np.linspace(0, w, grid_size + 1)
177
+ y_positions = np.linspace(0, h, grid_size + 1)
178
+
179
+ # Calculate which cell was clicked
180
+ for index, x in enumerate(x_positions[:-1]):
181
+ if evt.index[0] >= x and evt.index[0] <= x_positions[index+1]:
182
+ row = index
183
+
184
+ for index, y in enumerate(y_positions[:-1]):
185
+ if evt.index[1] >= y and evt.index[1] <= y_positions[index+1]:
186
+ col = index
187
+
188
+ cell_idx = (row, col)
189
+
190
+ # Toggle selection
191
+ if cell_idx in selected_cells:
192
+ selected_cells.remove(cell_idx)
193
+ else:
194
+ selected_cells.append(cell_idx)
195
+
196
+ # Add semi-transparent overlay for selected cells
197
+ highlight_layer = Image.new('RGBA', (w, h), (0, 0, 0, 0)) # Fully transparent layer
198
+ highlight_draw = ImageDraw.Draw(highlight_layer)
199
+
200
+ # Define a lighter green color with 40% transparency
201
+ light_green = (144, 238, 144, 102) # RGB = (144, 238, 144), Alpha = 102 (40% of 255)
202
+
203
+
204
+ for (row, col) in selected_cells:
205
+ cell_top_left = (x_positions[row], y_positions[col])
206
+ cell_bottom_right = (x_positions[row + 1], y_positions[col + 1])
207
+
208
+ highlight_draw.rectangle([cell_top_left, cell_bottom_right], fill=light_green, outline='white')
209
+
210
+ result_img = Image.alpha_composite(image.convert('RGBA'), highlight_layer)
211
+
212
+ return result_img, selected_cells
213
+
214
+
215
+ # Process example images
216
+ def process_example(image_path_bef, image_path_aft):
217
+ # Add grid to the example image
218
+ image_bef_grid, _, _ = add_grid_to_image(image_path_bef, 14)
219
+ image_aft_grid, _, _ = add_grid_to_image(image_path_aft, 14)
220
+ return image_bef_grid, image_aft_grid  # gridded versions of the example pair
221
+
222
+ def display_image(image_path):
223
+ image_grid, _, _ = add_grid_to_image(image_path, 14)
224
+ return image_grid, []
225
+
226
+ with gr.Blocks() as demo:
227
+ gr.Markdown("# TAB: Transformer Attention Bottleneck")
228
+
229
+ # Instructions
230
+ gr.Markdown("""
231
+ ## Instructions:
232
+ 1. Upload an image or select one from the examples
233
+ 2. Click on grid cells to select/deselect them
234
+ 3. Click the 'Predict' button to get model predictions
235
+ """)
236
+
237
+ selected_cells_bef = gr.State([])
238
+ selected_cells_aft = gr.State([])
239
+
240
+ with gr.Row():
241
+ with gr.Column(scale=1):
242
+ # Input components with grid overlay
243
+ image_bef = gr.Image(type="filepath")
244
+ image_aft = gr.Image(type="filepath")
245
+
246
+ predict_btn = gr.Button("Predict")
247
+
248
+ with gr.Column(scale=1):
249
+
250
+ image_display_with_grid_bef = gr.Image(type="pil", label="Before Image with Grid")
251
+ image_display_with_grid_aft = gr.Image(type="pil", label="After Image with Grid")
252
+
253
+ # Add click event to the displayed image
254
+ image_display_with_grid_bef.select(
255
+ handle_click,
256
+ inputs=[image_display_with_grid_bef, selected_cells_bef, image_bef],
257
+ outputs=[image_display_with_grid_bef, selected_cells_bef]
258
+ )
259
+
260
+ image_display_with_grid_aft.select(
261
+ handle_click,
262
+ inputs=[image_display_with_grid_aft, selected_cells_aft, image_aft],
263
+ outputs=[image_display_with_grid_aft, selected_cells_aft]
264
+ )
265
+
266
+ with gr.Row():
267
+ with gr.Column(scale=1):
268
+ # Example images
269
+ examples = gr.Examples(
270
+ examples=[["data/images/CLEVR_default_000572.png", "data/images/CLEVR_semantic_000572.png"],
271
+ ["data/images/CLEVR_default_003339.png", "data/images/CLEVR_semantic_003339.png"]],
272
+ inputs=[image_bef, image_aft],
273
+ outputs=[image_display_with_grid_bef, image_display_with_grid_aft],
274
+ label="Example Images",
275
+ fn=process_example,
276
+ examples_per_page=5
277
+ )
278
+
279
+ # image_bef.change(
280
+ # fn=display_image,
281
+ # inputs=[image_bef],
282
+ # outputs=[image_display_with_grid_bef, selected_cells_bef]
283
+ # )
284
+
285
+ # image_aft.change(
286
+ # fn=display_image,
287
+ # inputs=[image_aft],
288
+ # outputs=[image_display_with_grid_aft, selected_cells_aft]
289
+ # )
290
+
291
+ image_bef.change(
292
+ fn=None,
293
+ inputs=[image_bef],
294
+ outputs=[],
295
+ js="(image) => { initializeEditor(); importBackground(image); return []; }",
296
+ )
297
+
298
+ image_aft.change(
299
+ fn=None,
300
+ inputs=[image_aft],
301
+ outputs=[],
302
+ js="(image) => { initializeEditor(); importBackground(image); return []; }",
303
+ )
304
+
305
+ with gr.Column(scale=1):
306
+ # Output components
307
+ prediction = gr.Textbox(label="Predicted caption")
308
+ selected_info_bef = gr.Textbox(label="Selected patches on before")
309
+ selected_info_aft = gr.Textbox(label="Selected patches on after")
310
+
311
+ # Connect the predict button to the prediction function
312
+ predict_btn.click(
313
+ fn=predict_image,
314
+ inputs=[image_bef, image_aft, selected_cells_bef, selected_cells_aft],
315
+ outputs=[prediction, selected_info_bef, selected_info_aft]
316
+ )
317
+
318
+
319
+
320
+ app.mount("/js", StaticFiles(directory="js"), name="js")
321
+ gr.mount_gradio_app(app, demo, path="/")
322
+
323
+
324
+
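Note on the patch interventions: the 14x14 cell selections collected by the click handlers are flattened into 197-dimensional maps (one CLS slot plus 196 patches) by get_intervention_vector before being passed to the model. A minimal standalone sketch of that mapping (illustrative; nothing here is imported from app.py):

import numpy as np

def cells_to_map(selected_cells, grid_size=14):
    patch = np.zeros((grid_size, grid_size))
    for (i, j) in selected_cells:
        patch[i, j] = 1.0
    full = np.zeros((1, grid_size * grid_size + 1))  # index 0 is reserved for the CLS token
    full[0, 1:] = patch.reshape(-1)
    return full

print(cells_to_map([(0, 0), (3, 7)]).shape)  # (1, 197)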
fileservice.py ADDED
@@ -0,0 +1,48 @@
1
+ from fastapi import FastAPI, Request, Response
2
+
3
+ filenames = ["js/interactive_grid.js"]
4
+ contents = "\n".join(
5
+ [f"<script type='text/javascript' src='{x}'></script>" for x in filenames]
6
+ )
7
+
8
+ ga_script = """
9
+ <!-- Google tag (gtag.js) -->
10
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-11ZHMNWP9Y"></script>
11
+ <script>
12
+ window.dataLayer = window.dataLayer || [];
13
+ function gtag(){dataLayer.push(arguments);}
14
+ gtag('js', new Date());
15
+
16
+ gtag('config', 'G-11ZHMNWP9Y');
17
+ </script>
18
+ """
19
+
20
+ app = FastAPI()
21
+
22
+
23
+ @app.middleware("http")
24
+ async def insert_js(request: Request, call_next):
25
+ path = request.scope["path"] # get the request route
26
+ response = await call_next(request)
27
+
28
+ if path == "/":
29
+ response_body = ""
30
+ async for chunk in response.body_iterator:
31
+ response_body += chunk.decode()
32
+
33
+ charset_tag = '<meta charset="utf-8" />'
34
+ if charset_tag in response_body:
35
+ response_body = response_body.replace(charset_tag, charset_tag + ga_script)
36
+
37
+ response_body = response_body.replace("</body>", contents + "</body>")
38
+
39
+ del response.headers["content-length"]
40
+
41
+ return Response(
42
+ content=response_body,
43
+ status_code=response.status_code,
44
+ headers=dict(response.headers),
45
+ media_type=response.media_type,
46
+ )
47
+
48
+ return response
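The middleware above rewrites the Gradio index page so the browser also loads the grid script and the analytics tag. A self-contained toy sketch of the same injection pattern, checkable with FastAPI's TestClient (assumes httpx is available; all names here are illustrative):

from fastapi import FastAPI, Request, Response
from fastapi.responses import HTMLResponse
from fastapi.testclient import TestClient

toy = FastAPI()

@toy.get("/")
def index():
    return HTMLResponse("<html><body>hello</body></html>")

@toy.middleware("http")
async def inject(request: Request, call_next):
    response = await call_next(request)
    if request.scope["path"] != "/":
        return response
    body = b""
    async for chunk in response.body_iterator:  # drain the streamed body
        body += chunk
    body = body.replace(b"</body>", b"<script src='/js/interactive_grid.js'></script></body>")
    headers = dict(response.headers)
    headers.pop("content-length", None)  # length changed; let the new Response recompute it
    return Response(content=body, status_code=response.status_code,
                    headers=headers, media_type=response.media_type)

client = TestClient(toy)
assert "interactive_grid.js" in client.get("/").text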
js/interactive_grid.js ADDED
@@ -0,0 +1,194 @@
1
+ const gridSize = 14;
2
+ var cellSize = null;
3
+ var inputImage = null;
4
+ var image = null;
5
+ var canvas = null;
6
+ var ctx = null;
7
+ var canvasBg = null;
8
+ let grid = new Array(gridSize).fill(null).map(() => new Array(gridSize).fill(false));
9
+ var isInitialized = false;
10
+
11
+ let selectedCells = 0;
12
+
13
+ function createGrid() {
14
+ console.log('createGrid')
15
+
16
+ for (let i = 0; i < 196; i++) {
17
+ const div = document.createElement('div');
18
+ div.classList.add('checkbox');
19
+ div.innerHTML = '<input type="checkbox">';
20
+ grid.appendChild(div);
21
+ }
22
+ }
23
+
24
+ function loadImage(event) {
25
+ const file = event.target.files[0];
26
+ const reader = new FileReader();
27
+ reader.onload = function (e) {
28
+ image.src = e.target.result;
29
+ }
30
+ reader.readAsDataURL(file);
31
+ }
32
+
33
+
34
+ function handleMouseDown(event) {
35
+ // console.log("handleMouseDown");
36
+ }
37
+
38
+ function handleMouseMove(event) {
39
+ // console.log("handleMouseMove");
40
+ }
41
+
42
+ function handleMouseUp(event) {
43
+ // console.log("handleMouseUp");
44
+ }
45
+
46
+ function handleMouseLeave(event) {
47
+ // console.log("handleMouseLeave");
48
+ }
49
+
50
+
51
+ function drawGrid() {
52
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
53
+ drawBackground();
54
+ for (let row = 0; row < gridSize; row++) {
55
+ for (let col = 0; col < gridSize; col++) {
56
+ ctx.beginPath();
57
+ ctx.rect(col * cellSize, row * cellSize, cellSize, cellSize);
58
+ ctx.strokeStyle = 'black';
59
+ ctx.lineWidth = 2;
60
+ ctx.stroke();
61
+
62
+ if (grid[row][col]) {
63
+ ctx.fillStyle = 'rgba(0, 255, 0, 0.5)';
64
+ ctx.fillRect(col * cellSize, row * cellSize, cellSize, cellSize);
65
+ }
66
+ }
67
+ }
68
+ }
69
+
70
+
71
+ function initializeEditor() {
72
+ console.log("initializeEditor");
73
+
74
+ if (isInitialized) {
75
+ return;
76
+ }
77
+ isInitialized = true;
78
+
79
+ image = document.getElementById('image');
80
+ canvas = document.getElementById('canvas');
81
+ ctx = canvas.getContext('2d');
82
+
83
+ // Add click event listener to canvas
84
+ canvas.addEventListener('mousedown', handleMouseDown);
85
+ canvas.addEventListener('mousemove', handleMouseMove);
86
+ canvas.addEventListener('mouseup', handleMouseUp);
87
+ canvas.addEventListener('mouseleave', handleMouseLeave);
88
+
89
+ cellSize = canvas.width / gridSize;
90
+
91
+ canvas.addEventListener('click', (event) => {
92
+ const rect = canvas.getBoundingClientRect();
93
+ const scaleX = canvas.width / rect.width;
94
+ const scaleY = canvas.height / rect.height;
95
+ const x = (event.clientX - rect.left) * scaleX;
96
+ const y = (event.clientY - rect.top) * scaleY;
97
+ const row = Math.floor(y / cellSize);
98
+ const col = Math.floor(x / cellSize);
99
+
100
+ // If the cell is already selected, it's always allowed to deselect it
101
+ if (grid[row][col]) {
102
+ grid[row][col] = false;
103
+ selectedCells--; // Decrement the selected cell count
104
+ } else {
105
+ // Only select a new cell if less than 50 cells are already selected
106
+ if (selectedCells < 50) {
107
+ grid[row][col] = true;
108
+ selectedCells++; // Increment the selected cell count
109
+ }
110
+ }
111
+ drawGrid();
112
+ });
113
+
114
+ drawGrid();
115
+ }
116
+
117
+
118
+ function drawBackground() {
119
+ if (canvasBg != null) {
120
+ const canvasWidth = canvas.width;
121
+ const canvasHeight = canvas.height;
122
+
123
+ const bgWidth = canvasBg.width;
124
+ const bgHeight = canvasBg.height;
125
+
126
+ const scaleX = canvasWidth / bgWidth;
127
+ const scaleY = canvasHeight / bgHeight;
128
+
129
+ const scale = Math.max(scaleX, scaleY);
130
+
131
+ const newWidth = bgWidth * scale;
132
+ const newHeight = bgHeight * scale;
133
+
134
+ const xOffset = (canvasWidth - newWidth) / 2;
135
+ const yOffset = (canvasHeight - newHeight) / 2;
136
+
137
+ ctx.drawImage(canvasBg, 0, 0, bgWidth, bgHeight, xOffset, yOffset, newWidth, newHeight);
138
+ }
139
+ }
140
+
141
+ function importBackground(image) {
142
+ if (image == null) {
143
+ canvasBg = null;
144
+ drawGrid();
145
+ return;
146
+ }
147
+
148
+ let m = new Image();
149
+ m.src = image;
150
+ m.onload = function () {
151
+ canvasBg = m;
152
+ drawGrid();
153
+ }
154
+ }
155
+
156
+ function read_js_Data() {
157
+ console.log("read_js_Data");
158
+ console.log("read_js_Data");
159
+ console.log("read_js_Data");
160
+ console.log("read_js_Data");
161
+ console.log("read_js_Data");
162
+ return grid;
163
+ }
164
+
165
+
166
+ function set_grid_from_data(data) {
167
+ if (data.length !== gridSize || data[0].length !== gridSize) {
168
+ throw new Error('Invalid data dimensions. Expected ' + gridSize + 'x' + gridSize);
169
+ }
170
+
171
+ selectedCells = 0; // Reset the selected cell count
172
+ for (let row = 0; row < gridSize; row++) {
173
+ for (let col = 0; col < gridSize; col++) {
174
+ grid[row][col] = data[row][col];
175
+ if (grid[row][col]) {
176
+ selectedCells++; // Count the number of initially selected cells
177
+ }
178
+ }
179
+ }
180
+
181
+ drawGrid();
182
+ }
183
+
184
+
185
+ function clear_grid() {
186
+ console.log("clearGrid");
187
+ for (let row = 0; row < gridSize; row++) {
188
+ for (let col = 0; col < gridSize; col++) {
189
+ grid[row][col] = false;
190
+ }
191
+ }
192
+ selectedCells = 0; // Reset the selected cell count
193
+ drawGrid();
194
+ }
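app.py mounts this directory at /js with StaticFiles, and fileservice.py injects a matching <script> tag into the index page, so the browser fetches this file when the demo loads. A small smoke test from Python (assumes the Space is already running locally on port 7860):

import requests

resp = requests.get("http://localhost:7860/js/interactive_grid.js", timeout=5)
assert resp.status_code == 200 and "initializeEditor" in resp.text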
requirements.txt ADDED
@@ -0,0 +1,20 @@
1
+ boto3==1.36.9
2
+ botocore==1.36.9
3
+ certifi==2024.12.14
4
+ charset-normalizer==3.4.1
5
+ ftfy==6.2.3
6
+ idna==3.10
7
+ jmespath==1.0.1
8
+ numpy==1.23.0
9
+ opencv-python==4.11.0.86
10
+ Pillow==9.3.0
11
+ regex==2024.11.6
12
+ requests==2.32.3
13
+ s3transfer==0.11.2
14
+ gradio
15
+ torch
16
+ torchvision
17
+ torchaudio
18
+ tqdm==4.67.1
19
+ fastapi
20
+ uvicorn[standard]
utils/cross-base/cross_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 512,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 2048,
8
+ "max_position_embeddings": 150,
9
+ "num_attention_heads": 8,
10
+ "num_hidden_layers": 2,
11
+ "vocab_size": 512
12
+ }
utils/decoder-base/decoder_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 512,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 2048,
8
+ "num_attention_heads": 8,
9
+ "num_hidden_layers": 12,
10
+ "type_vocab_size": 2,
11
+ "vocab_size": 49408,
12
+ "num_decoder_layers": 3,
13
+ "max_target_embeddings": 77
14
+ }
utils/file_utils.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Utilities for working with the local dataset cache.
3
+ This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
4
+ Copyright by the AllenNLP authors.
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ import shutil
10
+ import tempfile
11
+ import json
12
+ from urllib.parse import urlparse
13
+ from pathlib import Path
14
+ from typing import Optional, Tuple, Union, IO, Callable, Set
15
+ from hashlib import sha256
16
+ from functools import wraps
17
+
18
+ from tqdm import tqdm
19
+
20
+ import boto3
21
+ from botocore.exceptions import ClientError
22
+ import requests
23
+
24
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
25
+
26
+ PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
27
+ Path.home() / '.pytorch_pretrained_bert'))
28
+
29
+
30
+ def url_to_filename(url: str, etag: str = None) -> str:
31
+ """
32
+ Convert `url` into a hashed filename in a repeatable way.
33
+ If `etag` is specified, append its hash to the url's, delimited
34
+ by a period.
35
+ """
36
+ url_bytes = url.encode('utf-8')
37
+ url_hash = sha256(url_bytes)
38
+ filename = url_hash.hexdigest()
39
+
40
+ if etag:
41
+ etag_bytes = etag.encode('utf-8')
42
+ etag_hash = sha256(etag_bytes)
43
+ filename += '.' + etag_hash.hexdigest()
44
+
45
+ return filename
46
+
47
+
48
+ def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[str, str]:
49
+ """
50
+ Return the url and etag (which may be ``None``) stored for `filename`.
51
+ Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
52
+ """
53
+ if cache_dir is None:
54
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
55
+ if isinstance(cache_dir, Path):
56
+ cache_dir = str(cache_dir)
57
+
58
+ cache_path = os.path.join(cache_dir, filename)
59
+ if not os.path.exists(cache_path):
60
+ raise FileNotFoundError("file {} not found".format(cache_path))
61
+
62
+ meta_path = cache_path + '.json'
63
+ if not os.path.exists(meta_path):
64
+ raise FileNotFoundError("file {} not found".format(meta_path))
65
+
66
+ with open(meta_path) as meta_file:
67
+ metadata = json.load(meta_file)
68
+ url = metadata['url']
69
+ etag = metadata['etag']
70
+
71
+ return url, etag
72
+
73
+
74
+ def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str:
75
+ """
76
+ Given something that might be a URL (or might be a local path),
77
+ determine which. If it's a URL, download the file and cache it, and
78
+ return the path to the cached file. If it's already a local path,
79
+ make sure the file exists and then return the path.
80
+ """
81
+ if cache_dir is None:
82
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
83
+ if isinstance(url_or_filename, Path):
84
+ url_or_filename = str(url_or_filename)
85
+ if isinstance(cache_dir, Path):
86
+ cache_dir = str(cache_dir)
87
+
88
+ parsed = urlparse(url_or_filename)
89
+
90
+ if parsed.scheme in ('http', 'https', 's3'):
91
+ # URL, so get it from the cache (downloading if necessary)
92
+ return get_from_cache(url_or_filename, cache_dir)
93
+ elif os.path.exists(url_or_filename):
94
+ # File, and it exists.
95
+ return url_or_filename
96
+ elif parsed.scheme == '':
97
+ # File, but it doesn't exist.
98
+ raise FileNotFoundError("file {} not found".format(url_or_filename))
99
+ else:
100
+ # Something unknown
101
+ raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
102
+
103
+
104
+ def split_s3_path(url: str) -> Tuple[str, str]:
105
+ """Split a full s3 path into the bucket name and path."""
106
+ parsed = urlparse(url)
107
+ if not parsed.netloc or not parsed.path:
108
+ raise ValueError("bad s3 path {}".format(url))
109
+ bucket_name = parsed.netloc
110
+ s3_path = parsed.path
111
+ # Remove '/' at beginning of path.
112
+ if s3_path.startswith("/"):
113
+ s3_path = s3_path[1:]
114
+ return bucket_name, s3_path
115
+
116
+
117
+ def s3_request(func: Callable):
118
+ """
119
+ Wrapper function for s3 requests in order to create more helpful error
120
+ messages.
121
+ """
122
+
123
+ @wraps(func)
124
+ def wrapper(url: str, *args, **kwargs):
125
+ try:
126
+ return func(url, *args, **kwargs)
127
+ except ClientError as exc:
128
+ if int(exc.response["Error"]["Code"]) == 404:
129
+ raise FileNotFoundError("file {} not found".format(url))
130
+ else:
131
+ raise
132
+
133
+ return wrapper
134
+
135
+
136
+ @s3_request
137
+ def s3_etag(url: str) -> Optional[str]:
138
+ """Check ETag on S3 object."""
139
+ s3_resource = boto3.resource("s3")
140
+ bucket_name, s3_path = split_s3_path(url)
141
+ s3_object = s3_resource.Object(bucket_name, s3_path)
142
+ return s3_object.e_tag
143
+
144
+
145
+ @s3_request
146
+ def s3_get(url: str, temp_file: IO) -> None:
147
+ """Pull a file directly from S3."""
148
+ s3_resource = boto3.resource("s3")
149
+ bucket_name, s3_path = split_s3_path(url)
150
+ s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
151
+
152
+
153
+ def http_get(url: str, temp_file: IO) -> None:
154
+ req = requests.get(url, stream=True)
155
+ content_length = req.headers.get('Content-Length')
156
+ total = int(content_length) if content_length is not None else None
157
+ progress = tqdm(unit="B", total=total)
158
+ for chunk in req.iter_content(chunk_size=1024):
159
+ if chunk: # filter out keep-alive new chunks
160
+ progress.update(len(chunk))
161
+ temp_file.write(chunk)
162
+ progress.close()
163
+
164
+
165
+ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
166
+ """
167
+ Given a URL, look for the corresponding dataset in the local cache.
168
+ If it's not there, download it. Then return the path to the cached file.
169
+ """
170
+ if cache_dir is None:
171
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
172
+ if isinstance(cache_dir, Path):
173
+ cache_dir = str(cache_dir)
174
+
175
+ os.makedirs(cache_dir, exist_ok=True)
176
+
177
+ # Get eTag to add to filename, if it exists.
178
+ if url.startswith("s3://"):
179
+ etag = s3_etag(url)
180
+ else:
181
+ response = requests.head(url, allow_redirects=True)
182
+ if response.status_code != 200:
183
+ raise IOError("HEAD request failed for url {} with status code {}"
184
+ .format(url, response.status_code))
185
+ etag = response.headers.get("ETag")
186
+
187
+ filename = url_to_filename(url, etag)
188
+
189
+ # get cache path to put the file
190
+ cache_path = os.path.join(cache_dir, filename)
191
+
192
+ if not os.path.exists(cache_path):
193
+ # Download to temporary file, then copy to cache dir once finished.
194
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
195
+ with tempfile.NamedTemporaryFile() as temp_file:
196
+ logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
197
+
198
+ # GET file object
199
+ if url.startswith("s3://"):
200
+ s3_get(url, temp_file)
201
+ else:
202
+ http_get(url, temp_file)
203
+
204
+ # we are copying the file before closing it, so flush to avoid truncation
205
+ temp_file.flush()
206
+ # shutil.copyfileobj() starts at the current position, so go to the start
207
+ temp_file.seek(0)
208
+
209
+ logger.info("copying %s to cache at %s", temp_file.name, cache_path)
210
+ with open(cache_path, 'wb') as cache_file:
211
+ shutil.copyfileobj(temp_file, cache_file)
212
+
213
+ logger.info("creating metadata file for %s", cache_path)
214
+ meta = {'url': url, 'etag': etag}
215
+ meta_path = cache_path + '.json'
216
+ with open(meta_path, 'w') as meta_file:
217
+ json.dump(meta, meta_file)
218
+
219
+ logger.info("removing temp file %s", temp_file.name)
220
+
221
+ return cache_path
222
+
223
+
224
+ def read_set_from_file(filename: str) -> Set[str]:
225
+ '''
226
+ Extract a de-duped collection (set) of text from a file.
227
+ Expected file format is one item per line.
228
+ '''
229
+ collection = set()
230
+ with open(filename, 'r', encoding='utf-8') as file_:
231
+ for line in file_:
232
+ collection.add(line.rstrip())
233
+ return collection
234
+
235
+
236
+ def get_file_extension(path: str, dot=True, lower: bool = True):
237
+ ext = os.path.splitext(path)[1]
238
+ ext = ext if dot else ext[1:]
239
+ return ext.lower() if lower else ext
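A short usage sketch of the caching helpers above (illustrative; the URL and ETag are placeholders, and no download happens for an existing local path):

from utils.file_utils import cached_path, url_to_filename

local = cached_path("requirements.txt")  # an existing local path is returned unchanged

name = url_to_filename("https://example.com/weights.bin", etag='"abc123"')  # hashed cache filename

# cached_path("https://example.com/weights.bin") would download the file into
# ~/.pytorch_pretrained_bert (or $PYTORCH_PRETRAINED_BERT_CACHE) and return the cached path.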
utils/model.py ADDED
@@ -0,0 +1,204 @@
1
+ from __future__ import absolute_import
2
+ from __future__ import division
3
+ from __future__ import print_function
4
+
5
+ import torch
6
+ from torch import nn
7
+
8
+ from .until_module import PreTrainedModel
9
+ from .module_cross import CrossModel, CrossConfig
10
+ from .module_decoder import DecoderModel, DecoderConfig
11
+
12
+ from utils.module_clip import CLIP, convert_weights
13
+ from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
14
+
15
+
16
+ def update_attr(target_name, target_config, target_attr_name, source_config, source_attr_name, default_value=None):
17
+ if hasattr(source_config, source_attr_name):
18
+ if default_value is None or getattr(source_config, source_attr_name) != default_value:
19
+ setattr(target_config, target_attr_name, getattr(source_config, source_attr_name))
20
+ return target_config
21
+
22
+ class CLIP4IDCPreTrainedModel(PreTrainedModel, nn.Module):
23
+ """ An abstract class to handle weights initialization and
24
+ a simple interface for downloading and loading pretrained models.
25
+ """
26
+ def __init__(self, cross_config, decoder_config, *inputs, **kwargs):
27
+ super(CLIP4IDCPreTrainedModel, self).__init__(cross_config, decoder_config)
28
+ self.cross_config = cross_config
29
+ self.decoder_config = decoder_config
30
+ self.clip = None
31
+ self.cross = None
32
+
33
+ @classmethod
34
+ def from_pretrained(cls, cross_model_name, decoder_model_name, state_dict=None, cache_dir=None, type_vocab_size=2, *inputs, **kwargs):
35
+
36
+
37
+ if state_dict is None: state_dict = {}
38
+ pretrained_clip_name = "ViT-B/16"
39
+ clip_state_dict = CLIP.get_config(pretrained_clip_name=pretrained_clip_name)
40
+ for key, val in clip_state_dict.items():
41
+ new_key = "clip." + key
42
+ if new_key not in state_dict:
43
+ state_dict[new_key] = val.clone()
44
+
45
+ cross_config, _ = CrossConfig.get_config(cross_model_name, cache_dir, type_vocab_size, state_dict=None)
46
+ decoder_config, _ = DecoderConfig.get_config(decoder_model_name, cache_dir, type_vocab_size, state_dict=None)
47
+
48
+ model = cls(cross_config, decoder_config, clip_state_dict, *inputs, **kwargs)
49
+
50
+ ## ===> Initialization trick [HARD CODE]
51
+ if model.linear_patch == "3d":
52
+ contain_conv2 = False
53
+ for key in state_dict.keys():
54
+ if key.find("visual.conv2.weight") > -1:
55
+ contain_conv2 = True
56
+ break
57
+ if contain_conv2 is False and hasattr(model.clip.visual, "conv2"):
58
+ cp_weight = state_dict["clip.visual.conv1.weight"].clone()
59
+ kernel_size = model.clip.visual.conv2.weight.size(2)
60
+ conv2_size = model.clip.visual.conv2.weight.size()
61
+ conv2_size = list(conv2_size)
62
+
63
+ left_conv2_size = conv2_size.copy()
64
+ right_conv2_size = conv2_size.copy()
65
+ left_conv2_size[2] = (kernel_size - 1) // 2
66
+ right_conv2_size[2] = kernel_size - 1 - left_conv2_size[2]
67
+
68
+ left_zeros, right_zeros = None, None
69
+ if left_conv2_size[2] > 0:
70
+ left_zeros = torch.zeros(*tuple(left_conv2_size), dtype=cp_weight.dtype, device=cp_weight.device)
71
+ if right_conv2_size[2] > 0:
72
+ right_zeros = torch.zeros(*tuple(right_conv2_size), dtype=cp_weight.dtype, device=cp_weight.device)
73
+
74
+ cat_list = []
75
+ if left_zeros is not None: cat_list.append(left_zeros)
76
+ cat_list.append(cp_weight.unsqueeze(2))
77
+ if right_zeros is not None: cat_list.append(right_zeros)
78
+ cp_weight = torch.cat(cat_list, dim=2)
79
+
80
+ state_dict["clip.visual.conv2.weight"] = cp_weight
81
+
82
+ ## <=== End of initialization trick
83
+
84
+ if state_dict is not None:
85
+ model = cls.init_preweight(model, state_dict)
86
+
87
+ return model
88
+
89
+
90
+
91
+ class CLIP4IDC(CLIP4IDCPreTrainedModel):
92
+ def __init__(self, cross_config, decoder_config, clip_state_dict):
93
+ super(CLIP4IDC, self).__init__(cross_config, decoder_config)
94
+ self.ignore_video_index = -1
95
+
96
+ # assert self.task_config.max_words <= cross_config.max_position_embeddings
97
+
98
+ # CLIP Encoders: From OpenAI: CLIP [https://github.com/openai/CLIP] ===>
99
+ vit = "visual.proj" in clip_state_dict
100
+ assert vit
101
+ if vit:
102
+ vision_width = clip_state_dict["visual.conv1.weight"].shape[0]
103
+ vision_layers = len(
104
+ [k for k in clip_state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
105
+ vision_patch_size = clip_state_dict["visual.conv1.weight"].shape[-1]
106
+ grid_size = round((clip_state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
107
+ image_resolution = vision_patch_size * grid_size
108
+ else:
109
+ counts: list = [len(set(k.split(".")[2] for k in clip_state_dict if k.startswith(f"visual.layer{b}"))) for b in
110
+ [1, 2, 3, 4]]
111
+ vision_layers = tuple(counts)
112
+ vision_width = clip_state_dict["visual.layer1.0.conv1.weight"].shape[0]
113
+ output_width = round((clip_state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
114
+ vision_patch_size = None
115
+ assert output_width ** 2 + 1 == clip_state_dict["visual.attnpool.positional_embedding"].shape[0]
116
+ image_resolution = output_width * 32
117
+
118
+ embed_dim = clip_state_dict["text_projection"].shape[1]
119
+ context_length = clip_state_dict["positional_embedding"].shape[0]
120
+ vocab_size = clip_state_dict["token_embedding.weight"].shape[0]
121
+ transformer_width = clip_state_dict["ln_final.weight"].shape[0]
122
+ transformer_heads = transformer_width // 64
123
+ transformer_layers = len(set(k.split(".")[2] for k in clip_state_dict if k.startswith(f"transformer.resblocks")))
124
+
125
+ self.linear_patch = '2d'
126
+
127
+ # use .float() to avoid overflow/underflow from fp16 weight. https://github.com/openai/CLIP/issues/40
128
+ cut_top_layer = 0
129
+ self.clip = CLIP(
130
+ embed_dim,
131
+ image_resolution, vision_layers-cut_top_layer, vision_width, vision_patch_size,
132
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers-cut_top_layer,
133
+ linear_patch=self.linear_patch, intra_layers=9
134
+ ).float()
135
+
136
+ bert_word_embeddings_weight = self.clip.token_embedding.weight
137
+ bert_position_embeddings_weight = self.clip.positional_embedding
138
+
139
+ for key in ["input_resolution", "context_length", "vocab_size"]:
140
+ if key in clip_state_dict:
141
+ del clip_state_dict[key]
142
+
143
+ convert_weights(self.clip)
144
+ # <=== End of CLIP Encoders
145
+
146
+ self.decoder = DecoderModel(decoder_config, bert_word_embeddings_weight, bert_position_embeddings_weight)
147
+
148
+ self.apply(self.init_weights)
149
+
150
+ def get_visual_output(self, video, visual_mask, left_gt_map, right_gt_map, shaped=False, video_frame=-1):
151
+
152
+ bs_pair = visual_mask.size(0)
153
+ visual_hidden, visual_output, left_map, right_map = self.clip.encode_image(video, left_gt_map, right_gt_map, video_frame=video_frame, return_hidden=True)
154
+ visual_hidden = visual_hidden.float()
155
+ visual_output = visual_output.float()
156
+ visual_hidden = visual_hidden.view(bs_pair, -1, visual_hidden.size(-1))
157
+
158
+ left_map = left_map.float()
159
+ right_map = right_map.float()
160
+
161
+ return visual_hidden, visual_output, left_map, right_map
162
+
163
+ def get_sequence_visual_output(self, video, visual_mask, left_gt_map, right_gt_map, shaped=False, video_frame=-1):
164
+ if shaped is False:
165
+ visual_mask = visual_mask.view(-1, visual_mask.shape[-1])
166
+ video = torch.as_tensor(video).float()
167
+ b, pair, channel, h, w = video.shape
168
+ video = video.view(b * pair, channel, h, w)
169
+ video_frame = pair
170
+
171
+ _, visual_hidden, left_map, right_map = self.get_visual_output(video, visual_mask, left_gt_map, right_gt_map, shaped=True, video_frame=video_frame)
172
+
173
+ return visual_hidden, left_map, right_map
174
+
175
+ def _get_decoder_score(self, visual_output, visual_mask, input_caption_ids, decoder_mask):
176
+ res_tuples = ()
177
+ decoder_scores = self.decoder(input_caption_ids, encoder_outs=visual_output, answer_mask=decoder_mask, encoder_mask=visual_mask)
178
+
179
+ return decoder_scores, res_tuples
180
+
181
+ def decoder_caption(self, visual_output, visual_mask, input_caption_ids, decoder_mask, get_logits=False):
182
+
183
+ decoder_scores, _ = self._get_decoder_score(visual_output, visual_mask,
184
+ input_caption_ids, decoder_mask)
185
+
186
+ if get_logits:
187
+ return decoder_scores
188
+
189
+ _, decoder_scores_result = torch.max(decoder_scores, -1)
190
+
191
+ return decoder_scores_result
192
+
193
+
194
+ def init_model(model_path, device):
195
+
196
+ model_state_dict = torch.load(model_path, map_location='cpu')
197
+
198
+ # Prepare model
199
+ cache_dir = ""
200
+ model = CLIP4IDC.from_pretrained("cross-base", "decoder-base", cache_dir=cache_dir, state_dict=model_state_dict)
201
+
202
+ model.to(device)
203
+
204
+ return model
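init_model is the entry point used by app.py. A minimal load sketch (assumes data/pytorch_model.pt is present, as in the demo; decoding follows greedy_decode in app.py):

import torch
from utils.model import init_model
from utils.tokenization_clip import SimpleTokenizer as ClipTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = init_model("data/pytorch_model.pt", device)
model.eval()
tokenizer = ClipTokenizer()
# Image pairs and intervention maps are then fed through model.get_sequence_visual_output
# and model.decoder_caption, as shown in greedy_decode() in app.py.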
utils/module_clip.py ADDED
@@ -0,0 +1,679 @@
1
+ """
2
+ Adapted from: https://github.com/openai/CLIP/blob/main/clip/clip.py
3
+ """
4
+ import warnings
5
+ from collections import OrderedDict
6
+ from typing import Tuple, Union, Optional
7
+
8
+ import hashlib
9
+ import os
10
+ import urllib
11
+
12
+ from tqdm import tqdm
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+
17
+ from torch import Tensor
18
+ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
19
+ from torch.nn.init import xavier_uniform_
20
+ from torch.nn.init import constant_
21
+ from torch.nn.init import xavier_normal_
22
+ from torch.nn.parameter import Parameter
23
+
24
+ from torch.nn.modules.module import Module
25
+ from .module_gated_attention import gated_coattention
26
+ from torch import nn
27
+
28
+
29
+ _MODELS = {
30
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
31
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
32
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
33
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
34
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
35
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
36
+ }
37
+ _PT_NAME = {
38
+ "RN50": "RN50.pt",
39
+ "RN101": "RN101.pt",
40
+ "RN50x4": "RN50x4.pt",
41
+ "RN50x16": "RN50x16.pt",
42
+ "ViT-B/32": "ViT-B-32.pt",
43
+ "ViT-B/16": "ViT-B-16.pt",
44
+ }
45
+
46
+ def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
47
+ os.makedirs(root, exist_ok=True)
48
+ filename = os.path.basename(url)
49
+
50
+ expected_sha256 = url.split("/")[-2]
51
+ download_target = os.path.join(root, filename)
52
+
53
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
54
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
55
+
56
+ if os.path.isfile(download_target):
57
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
58
+ return download_target
59
+ else:
60
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
61
+
62
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
63
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
64
+ while True:
65
+ buffer = source.read(8192)
66
+ if not buffer:
67
+ break
68
+
69
+ output.write(buffer)
70
+ loop.update(len(buffer))
71
+
72
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
73
+ raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
74
+
75
+ return download_target
76
+
77
+ def available_models():
78
+ """Returns the names of available CLIP models"""
79
+ return list(_MODELS.keys())
80
+
81
+ # =============================
82
+
83
+
84
+ class TABAttention(Module):
85
+ r"""Allows the model to jointly attend to information
86
+ from different representation subspaces.
87
+ See `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_
88
+
89
+ .. math::
90
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
91
+
92
+ where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
93
+
94
+ Args:
95
+ embed_dim: total dimension of the model.
96
+ num_heads: parallel attention heads.
97
+ dropout: a Dropout layer on attn_output_weights. Default: 0.0.
98
+ bias: add bias as module parameter. Default: True.
99
+ add_bias_kv: add bias to the key and value sequences at dim=0.
100
+ add_zero_attn: add a new batch of zeros to the key and
101
+ value sequences at dim=1.
102
+ kdim: total number of features in key. Default: None.
103
+ vdim: total number of features in value. Default: None.
104
+
105
+ Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set
106
+ to :attr:`embed_dim` such that query, key, and value have the same
107
+ number of features.
108
+
109
+ Examples::
110
+
111
+ >>> multihead_attn = TABAttention(embed_dim, num_heads)
112
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
113
+
114
+
115
+
116
+ This is a version of multihead attention written to comply with the definition of TAB.
117
+ """
118
+ bias_k: Optional[torch.Tensor]
119
+ bias_v: Optional[torch.Tensor]
120
+
121
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
122
+ super(TABAttention, self).__init__()
123
+ self.embed_dim = embed_dim
124
+ self.kdim = kdim if kdim is not None else embed_dim
125
+ self.vdim = vdim if vdim is not None else embed_dim
126
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
127
+
128
+ self.num_heads = num_heads
129
+ self.dropout = dropout
130
+ self.head_dim = embed_dim // num_heads
131
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
132
+
133
+ if self._qkv_same_embed_dim is False:
134
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
135
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
136
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
137
+ self.register_parameter('in_proj_weight', None)
138
+ else:
139
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
140
+ self.register_parameter('q_proj_weight', None)
141
+ self.register_parameter('k_proj_weight', None)
142
+ self.register_parameter('v_proj_weight', None)
143
+
144
+ if bias:
145
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
146
+ else:
147
+ self.register_parameter('in_proj_bias', None)
148
+ self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias)
149
+
150
+ if add_bias_kv:
151
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
152
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
153
+ else:
154
+ self.bias_k = self.bias_v = None
155
+
156
+ self.add_zero_attn = add_zero_attn
157
+
158
+ self._reset_parameters()
159
+
160
+ def _reset_parameters(self):
161
+ if self._qkv_same_embed_dim:
162
+ xavier_uniform_(self.in_proj_weight)
163
+ else:
164
+ xavier_uniform_(self.q_proj_weight)
165
+ xavier_uniform_(self.k_proj_weight)
166
+ xavier_uniform_(self.v_proj_weight)
167
+
168
+ if self.in_proj_bias is not None:
169
+ constant_(self.in_proj_bias, 0.)
170
+ constant_(self.out_proj.bias, 0.)
171
+ if self.bias_k is not None:
172
+ xavier_normal_(self.bias_k)
173
+ if self.bias_v is not None:
174
+ xavier_normal_(self.bias_v)
175
+
176
+ def __setstate__(self, state):
177
+ # Support loading old TABAttention checkpoints generated by v1.1.0
178
+ if '_qkv_same_embed_dim' not in state:
179
+ state['_qkv_same_embed_dim'] = True
180
+
181
+ super(TABAttention, self).__setstate__(state)
182
+
183
+ def forward(self, query: Tensor, key: Tensor, value: Tensor, gt_attention_map: Optional[Tensor] = None, key_padding_mask: Optional[Tensor] = None,
184
+ need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
185
+ r"""
186
+ Args:
187
+ query, key, value: map a query and a set of key-value pairs to an output.
188
+ See "Attention Is All You Need" for more details.
189
+ key_padding_mask: if provided, specified padding elements in the key will
190
+ be ignored by the attention. When given a binary mask and a value is True,
191
+ the corresponding value on the attention layer will be ignored. When given
192
+ a byte mask and a value is non-zero, the corresponding value on the attention
193
+ layer will be ignored
194
+ need_weights: output attn_output_weights.
195
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
196
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
197
+
198
+ Shapes for inputs:
199
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
200
+ the embedding dimension.
201
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
202
+ the embedding dimension.
203
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
204
+ the embedding dimension.
205
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
206
+ If a ByteTensor is provided, the non-zero positions will be ignored while the position
207
+ with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
208
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
209
+ - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the
210
+ source sequence length.
211
+
212
+ If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the batch size, L is the target sequence
213
+ length, S is the source sequence length. ``attn_mask`` ensure that position i is allowed to attend
214
+ the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
215
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
216
+ is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
217
+ is provided, it will be added to the attention weight.
218
+
219
+ Shapes for outputs:
220
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
221
+ E is the embedding dimension.
222
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
223
+ L is the target sequence length, S is the source sequence length.
224
+ """
225
+ if not self._qkv_same_embed_dim:
226
+ return gated_coattention(
227
+ query, key, value, self.embed_dim, self.num_heads,
228
+ self.in_proj_weight.half(), self.in_proj_bias.half(),
229
+ self.bias_k, self.bias_v, self.add_zero_attn,
230
+ self.dropout, self.out_proj.weight.half(), self.out_proj.bias.half(),
231
+ training=self.training, gt_attention_map=gt_attention_map,
232
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
233
+ attn_mask=attn_mask, use_separate_proj_weight=True,
234
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
235
+ v_proj_weight=self.v_proj_weight)
236
+ else:
237
+ return gated_coattention(
238
+ query, key, value, self.embed_dim, self.num_heads,
239
+ self.in_proj_weight.half(), self.in_proj_bias.half(),
240
+ self.bias_k, self.bias_v, self.add_zero_attn,
241
+ self.dropout, self.out_proj.weight.half(), self.out_proj.bias.half(),
242
+ training=self.training, gt_attention_map=gt_attention_map,
243
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
244
+ attn_mask=attn_mask)
245
+
246
+
247
+ class LayerNorm(nn.LayerNorm):
248
+ """Subclass torch's LayerNorm to handle fp16."""
249
+
250
+ def forward(self, x: torch.Tensor):
251
+ orig_type = x.dtype
252
+ ret = super().forward(x.type(torch.float32))
253
+ return ret.type(orig_type)
254
+
255
+
256
+ class QuickGELU(nn.Module):
257
+ def forward(self, x: torch.Tensor):
258
+ return x * torch.sigmoid(1.702 * x)
259
+
260
+
261
+ class ResidualAttentionBlock(nn.Module):
262
+ def __init__(self, d_model: int, n_head: int, attn_mask=None):
263
+ super().__init__()
264
+
265
+ self.attn = nn.MultiheadAttention(d_model, n_head)
266
+ self.ln_1 = LayerNorm(d_model)
267
+ self.mlp = nn.Sequential(OrderedDict([
268
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
269
+ ("gelu", QuickGELU()),
270
+ ("c_proj", nn.Linear(d_model * 4, d_model))
271
+ ]))
272
+ self.ln_2 = LayerNorm(d_model)
273
+ self.attn_mask = attn_mask
274
+
275
+ def attention(self, x: torch.Tensor):
276
+ attn_mask_ = self.attn_mask
277
+ if self.attn_mask is not None and hasattr(self.attn_mask, '__call__'):
278
+ attn_mask_ = self.attn_mask(x.size(0)) # LND
279
+
280
+ attn_mask_ = attn_mask_.to(dtype=x.dtype, device=x.device) if attn_mask_ is not None else None
281
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask_)[0]
282
+
283
+ def forward(self, x_tuple:tuple):
284
+ x, video_frame = x_tuple
285
+ x = x + self.attention(self.ln_1(x))
286
+ x = x + self.mlp(self.ln_2(x))
287
+ return (x, video_frame)
288
+
289
+ def visualize_attention(self, x: torch.Tensor):
290
+ attn_outputs, attn_weights = self.attn(x, x, x, need_weights=True, attn_mask=None)
291
+ return attn_outputs, attn_weights
292
+
293
+ def visualize_forward(self, x_tuple:tuple):
294
+ x, video_frame = x_tuple
295
+ attn_outputs, attn_weights = self.visualize_attention(self.ln_1(x))
296
+ x = x + attn_outputs
297
+ x = x + self.mlp(self.ln_2(x))
298
+ return (x, video_frame, attn_weights)
299
+
300
+ class TABLayer(nn.Module):
301
+ def __init__(self, d_model: int, n_head: int, attn_mask=None):
302
+ super().__init__()
303
+
304
+ self.attn = TABAttention(d_model, n_head)
305
+ self.ln_1 = LayerNorm(d_model)
306
+ self.mlp = nn.Sequential(OrderedDict([
307
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
308
+ ("gelu", QuickGELU()),
309
+ ("c_proj", nn.Linear(d_model * 4, d_model))
310
+ ]))
311
+ self.ln_2 = LayerNorm(d_model)
312
+ self.attn_mask = attn_mask
313
+
314
+ def attention(self, x: torch.Tensor, y: torch.Tensor):
315
+ attn_mask_ = self.attn_mask
316
+ if self.attn_mask is not None and hasattr(self.attn_mask, '__call__'):
317
+ attn_mask_ = self.attn_mask(x.size(0)) # LND
318
+
319
+ attn_mask_ = attn_mask_.to(dtype=x.dtype, device=x.device) if attn_mask_ is not None else None
320
+ return self.attn(x, y, y, need_weights=False, attn_mask=attn_mask_)[0]
321
+
322
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
323
+ x = self.attention(self.ln_1(x), self.ln_1(y))
324
+ x = x + self.mlp(self.ln_2(x))
325
+ return x
326
+
327
+ def visualize_attention(self, x: torch.Tensor, y: torch.Tensor, gt_attention_map):
328
+ attn_outputs, attn_weights = self.attn(x, y, y, gt_attention_map=gt_attention_map, need_weights=True, attn_mask=None)
329
+ return attn_outputs, attn_weights
330
+
331
+ def visualize_forward(self, x: torch.Tensor, y: torch.Tensor, gt_attention_map):
332
+ attn_outputs, attn_weights = self.visualize_attention(self.ln_1(x), self.ln_1(y), gt_attention_map)
333
+ x = attn_outputs
334
+ x = x + self.mlp(self.ln_2(x))
335
+ return (x, attn_weights)
336
+
337
+ class visionTransformer(nn.Module):
338
+ def __init__(self, width: int, layers: int, heads: int, attn_mask = None):
339
+ super().__init__()
340
+ self.width = width
341
+ self.layers = layers
342
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) if i < (layers - 1) else TABLayer(width, 1, attn_mask) for i in range(layers)])
343
+
344
+ def forward(self, x: torch.Tensor, video_frame=-1):
345
+ return self.resblocks((x, video_frame))[0]
346
+
347
+ class Transformer(nn.Module):
348
+ def __init__(self, width: int, layers: int, heads: int, attn_mask = None):
349
+ super().__init__()
350
+ self.width = width
351
+ self.layers = layers
352
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
353
+
354
+ def forward(self, x: torch.Tensor, video_frame=-1):
355
+ return self.resblocks((x, video_frame))[0]
356
+
357
+
358
+ class VisualTransformer(nn.Module):
359
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int,
360
+ linear_patch: str = '2d', intra_layers: int = 9):
361
+ super().__init__()
362
+ self.input_resolution = input_resolution
363
+ self.output_dim = output_dim
364
+ self.intra_layers = intra_layers
365
+
366
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
367
+
368
+ scale = width ** -0.5
369
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
370
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
371
+ self.ln_pre = LayerNorm(width)
372
+
373
+ self.joint_positional_embedding = nn.Parameter(scale * torch.randn(2 * ((input_resolution // patch_size) ** 2 + 1), width))
374
+ self.bef_embedding = nn.Parameter(scale * torch.randn(width))
375
+ self.aft_embedding = nn.Parameter(scale * torch.randn(width))
376
+ self.ln_mid = LayerNorm(width)
377
+
378
+ self.transformer = visionTransformer(width, layers, heads)
379
+
380
+ self.ln_post = LayerNorm(width)
381
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
382
+
383
+ # For 3D
384
+ assert linear_patch in ['2d', '3d']
385
+ self.linear_patch = linear_patch
386
+ if self.linear_patch == '3d':
387
+ self.conv2 = nn.Conv3d(in_channels=3, out_channels=width, kernel_size=(3, patch_size, patch_size),
388
+ stride=(1, patch_size, patch_size), padding=(1, 0, 0), bias=False)
389
+
390
+ def forward(self, x: torch.Tensor, left_gt_map, right_gt_map, video_frame=-1, visualize=False):
391
+
392
+ if self.linear_patch == '3d':
393
+ assert video_frame != -1
394
+ x_3d = x.reshape(-1, video_frame, x.shape[-3], x.shape[-2], x.shape[-1])
395
+ x_3d = x_3d.permute(0, 2, 1, 3, 4)
396
+ x_3d = self.conv2(x_3d) # shape = [*, width, frame, grid, grid]
397
+ x_3d = x_3d.permute(0, 2, 1, 3, 4) # shape = [*, frame, width, grid, grid]
398
+ x = x_3d.reshape(-1, x_3d.shape[-3], x_3d.shape[-2], x_3d.shape[-1]).contiguous() # shape = [*, width, grid, grid]
399
+ else:
400
+ x = self.conv1(x) # shape = [*, width, grid, grid]
401
+
402
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
403
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
404
+
405
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
406
+ x = x + self.positional_embedding.to(x.dtype)
407
+ x = self.ln_pre(x)
408
+
409
+ x = x.permute(1, 0, 2) # NLD -> LND
410
+
411
+ if visualize is True:
412
+ all_attn_weights = []
413
+ for i in range(self.intra_layers):
414
+ x, _, attn_weights = self.transformer.resblocks[i].visualize_forward((x, video_frame))
415
+ attn_weights = attn_weights.view(x.size(1) // video_frame, -1, attn_weights.size(-2),
416
+ attn_weights.size(-1))
417
+ all_attn_weights.append(attn_weights)
418
+ else:
419
+ for i in range(self.intra_layers):
420
+ x = self.transformer.resblocks[i]((x, video_frame))[0]
421
+ x = x.permute(1, 0, 2) # LND -> NLD
422
+
423
+ bs = x.size(0) // video_frame
424
+ x = x.view(bs, video_frame, x.size(-2), x.size(-1))
425
+ x = torch.cat([x[:, 0] + self.bef_embedding.to(x.dtype),
426
+ x[:, 1] + self.aft_embedding.to(x.dtype)], dim=1)
427
+
428
+ x = x + self.joint_positional_embedding.to(x.dtype)
429
+ x = self.ln_mid(x)
430
+
431
+ x = x.permute(1, 0, 2) # NLD -> LND
432
+
433
+ if visualize is True:
434
+ for i in range(self.intra_layers, self.transformer.layers - 1):
435
+ x, _, attn_weights = self.transformer.resblocks[i].visualize_forward((x, video_frame))
436
+ all_attn_weights.append(attn_weights)
437
+ cls_index = int(x.size(0) / 2)
438
+ left_features, left_attn_weights = self.transformer.resblocks[-1].visualize_forward(x[:cls_index, :, :], x[cls_index:, :, :], right_gt_map)
439
+ right_features, right_attn_weights = self.transformer.resblocks[-1].visualize_forward(x[cls_index:, :, :], x[:cls_index, :, :], left_gt_map)
440
+
441
+ all_attn_weights.append(left_attn_weights)
442
+ all_attn_weights.append(right_attn_weights)
443
+ else:
444
+ for i in range(self.intra_layers, self.transformer.layers - 1):
445
+ x = self.transformer.resblocks[i]((x, video_frame))[0]
446
+ cls_index = int(x.size(0) / 2)
447
+ left_features, left_attn_weights = self.transformer.resblocks[-1].visualize_forward(x[:cls_index, :, :], x[cls_index:, :, :], right_gt_map)
448
+ right_features, right_attn_weights = self.transformer.resblocks[-1].visualize_forward(x[cls_index:, :, :], x[:cls_index, :, :], left_gt_map)
449
+
450
+ left_features = left_features.permute(1, 0, 2) # LND -> NLD
451
+ right_features = right_features.permute(1, 0, 2) # LND -> NLD
452
+ x = torch.cat([left_features, right_features], 1)
453
+
454
+ # Move the three lines below to `encode_image` for entire hidden sequence
455
+ # x = self.ln_post(x[:, 0, :])
456
+ # if self.proj is not None:
457
+ # x = x @ self.proj
458
+
459
+ if visualize is True:
460
+ return x, all_attn_weights
461
+ return x, left_attn_weights, right_attn_weights
462
+
463
+
464
+ class CLIP(nn.Module):
465
+ def __init__(self,
466
+ embed_dim: int,
467
+ # vision
468
+ image_resolution: int,
469
+ vision_layers: Union[Tuple[int, int, int, int], int],
470
+ vision_width: int,
471
+ vision_patch_size: int,
472
+ # text
473
+ context_length: int,
474
+ vocab_size: int,
475
+ transformer_width: int,
476
+ transformer_heads: int,
477
+ transformer_layers: int,
478
+ # vision linear of patch
479
+ linear_patch: str = '2d',
480
+ intra_layers: int = 9,
481
+ ):
482
+ super().__init__()
483
+
484
+ self.context_length = context_length
485
+
486
+ vision_heads = vision_width // 64
487
+ self.visual = VisualTransformer(
488
+ input_resolution=image_resolution,
489
+ patch_size=vision_patch_size,
490
+ width=vision_width,
491
+ layers=vision_layers,
492
+ heads=vision_heads,
493
+ output_dim=embed_dim,
494
+ linear_patch=linear_patch,
495
+ intra_layers=intra_layers,
496
+ )
497
+
498
+ self.transformer = Transformer(
499
+ width=transformer_width,
500
+ layers=transformer_layers,
501
+ heads=transformer_heads,
502
+ attn_mask=self.build_attention_mask
503
+ )
504
+
505
+ self.vocab_size = vocab_size
506
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
507
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
508
+ self.ln_final = LayerNorm(transformer_width)
509
+
510
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
511
+ self.logit_scale = nn.Parameter(torch.ones([]))
512
+
513
+ self.initialize_parameters()
514
+
515
+ def initialize_parameters(self):
516
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
517
+ nn.init.normal_(self.positional_embedding, std=0.01)
518
+
519
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
520
+ attn_std = self.transformer.width ** -0.5
521
+ fc_std = (2 * self.transformer.width) ** -0.5
522
+ for block in self.transformer.resblocks:
523
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
524
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
525
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
526
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
527
+
528
+ if self.text_projection is not None:
529
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
530
+
531
+ @staticmethod
532
+ def get_config(pretrained_clip_name="ViT-B/32"):
533
+ model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViT-B-32.pt")
534
+ if pretrained_clip_name in _MODELS and pretrained_clip_name in _PT_NAME:
535
+ model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), _PT_NAME[pretrained_clip_name])
536
+
537
+ if pretrained_clip_name in ["ViT-B/32", "ViT-B/16"] and os.path.exists(model_path):
538
+ pass
539
+ else:
540
+ if pretrained_clip_name in _MODELS:
541
+ model_path = _download(_MODELS[pretrained_clip_name])
542
+ elif os.path.isfile(pretrained_clip_name):
543
+ model_path = pretrained_clip_name
544
+ else:
545
+ raise RuntimeError(f"Model {pretrained_clip_name} not found; available models = {available_models()}")
546
+
547
+ try:
548
+ # loading JIT archive
549
+ model = torch.jit.load(model_path, map_location="cpu").eval()
550
+ state_dict = model.state_dict()
551
+ except RuntimeError:
552
+ state_dict = torch.load(model_path, map_location="cpu")
553
+
554
+ return state_dict
555
+
556
+ def build_attention_mask(self, context_length):
557
+ # lazily create the causal attention mask used by the text transformer
558
+ # pytorch uses additive attention mask; fill with -inf
559
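+ # e.g. build_attention_mask(3) returns
+ # [[0., -inf, -inf],
+ # [0., 0., -inf],
+ # [0., 0., 0.]]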
+ mask = torch.zeros(context_length, context_length)
560
+ mask.fill_(float("-inf"))
561
+ mask.triu_(1) # zero out the lower diagonal
562
+ return mask
563
+
564
+ @property
565
+ def dtype(self):
566
+ return self.visual.conv1.weight.dtype
567
+
568
+ def encode_image(self, image, left_gt_map, right_gt_map, return_hidden=False, video_frame=-1):
569
+ hidden, left_map, right_map = self.visual(image.type(self.dtype), left_gt_map, right_gt_map, video_frame=video_frame)
570
+ hidden = self.visual.ln_post(hidden) @ self.visual.proj
571
+
572
+ cls_index = int(hidden.size(1) / 2)
573
+ hidden2 = torch.cat([hidden[:, 0, :].unsqueeze(1), hidden[:, cls_index, :].unsqueeze(1)], 1)
574
+ x = torch.mean(hidden2, 1)
575
+
576
+ if return_hidden:
577
+ return x, hidden2, left_map, right_map
578
+
579
+ return x, left_map, right_map
580
+
581
+ def encode_text(self, text, return_hidden=False):
582
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
583
+
584
+ pos_emd = self.positional_embedding[:x.size(1), :].type(self.dtype)
585
+ x = x + pos_emd
586
+ x = x.permute(1, 0, 2) # NLD -> LND
587
+ x = self.transformer(x)
588
+ x = x.permute(1, 0, 2) # LND -> NLD
589
+
590
+ hidden = self.ln_final(x).type(self.dtype) @ self.text_projection
591
+
592
+ # x.shape = [batch_size, n_ctx, transformer.width]
593
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
594
+ x = hidden[torch.arange(hidden.shape[0]), text.argmax(dim=-1)]
595
+
596
+ if return_hidden:
597
+ return x, hidden
598
+
599
+ return x
600
+
601
+ def forward(self, image, text, left_gt_map=None, right_gt_map=None):
602
+ image_features, _, _ = self.encode_image(image, left_gt_map, right_gt_map)
603
+ text_features = self.encode_text(text)
604
+
605
+ # normalized features
606
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
607
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
608
+
609
+ # cosine similarity as logits
610
+ logit_scale = self.logit_scale.exp()
611
+ logits_per_image = logit_scale * image_features @ text_features.t()
612
+ logits_per_text = logit_scale * text_features @ image_features.t()
613
+
614
+ # shape = [global_batch_size, global_batch_size]
615
+ return logits_per_image, logits_per_text
616
+
617
+
618
+ def convert_weights(model: nn.Module):
619
+ """Convert applicable model parameters to fp16"""
620
+
621
+ def _convert_weights_to_fp16(l):
622
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)):
623
+ l.weight.data = l.weight.data.half()
624
+ if l.bias is not None:
625
+ l.bias.data = l.bias.data.half()
626
+
627
+ if isinstance(l, nn.MultiheadAttention):
628
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
629
+ tensor = getattr(l, attr)
630
+ if tensor is not None:
631
+ tensor.data = tensor.data.half()
632
+
633
+ for name in ["text_projection", "proj"]:
634
+ if hasattr(l, name):
635
+ attr = getattr(l, name)
636
+ if attr is not None:
637
+ attr.data = attr.data.half()
638
+
639
+ model.apply(_convert_weights_to_fp16)
640
+
641
+
642
+ def build_model(state_dict: dict):
643
+ vit = "visual.proj" in state_dict
644
+
645
+ if vit:
646
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
647
+ vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
648
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
649
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
650
+ image_resolution = vision_patch_size * grid_size
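+ # e.g. for ViT-B/32: conv1.weight is [768, 3, 32, 32] and positional_embedding is [50, 768],
+ # giving width 768, patch size 32, grid 7 and input resolution 224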
651
+ else:
652
+ counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
653
+ vision_layers = tuple(counts)
654
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
655
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
656
+ vision_patch_size = None
657
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
658
+ image_resolution = output_width * 32
659
+
660
+ embed_dim = state_dict["text_projection"].shape[1]
661
+ context_length = state_dict["positional_embedding"].shape[0]
662
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
663
+ transformer_width = state_dict["ln_final.weight"].shape[0]
664
+ transformer_heads = transformer_width // 64
665
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
666
+
667
+ model = CLIP(
668
+ embed_dim,
669
+ image_resolution, vision_layers, vision_width, vision_patch_size,
670
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
671
+ )
672
+
673
+ for key in ["input_resolution", "context_length", "vocab_size"]:
674
+ if key in state_dict:
675
+ del state_dict[key]
676
+
677
+ convert_weights(model)
678
+ model.load_state_dict(state_dict)
679
+ return model.eval()
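A rough sketch of how these helpers fit together (illustrative only: the variable name below is hypothetical, it assumes the `CLIP` class above is importable and the checkpoint is reachable, and note that the stock OpenAI checkpoint does not contain the extra TAB / before-after parameters added above, so loading it into this modified class may require a non-strict load):

    state_dict = CLIP.get_config(pretrained_clip_name="ViT-B/32")     # downloads on first use
    print(state_dict["visual.conv1.weight"].shape)                    # [768, 3, 32, 32] -> width 768, patch 32
    print(state_dict["visual.positional_embedding"].shape)            # [50, 768] -> grid 7, resolution 224
    print(state_dict["text_projection"].shape)                        # [512, 512] -> embed_dim 512

`build_model` reads exactly these shapes to infer the architecture hyper-parameters before instantiating `CLIP`.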
utils/module_cross.py ADDED
@@ -0,0 +1,394 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import os
23
+ import copy
24
+ import json
25
+ import math
26
+ import logging
27
+ import tarfile
28
+ import tempfile
29
+ import shutil
30
+
31
+ import torch
32
+ from torch import nn
33
+ import torch.nn.functional as F
34
+ from .file_utils import cached_path
35
+ from .until_config import PretrainedConfig
36
+ from .until_module import PreTrainedModel, LayerNorm, ACT2FN
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ PRETRAINED_MODEL_ARCHIVE_MAP = {}
41
+ CONFIG_NAME = 'cross_config.json'
42
+ WEIGHTS_NAME = 'cross_pytorch_model.bin'
43
+
44
+
45
+ class CrossConfig(PretrainedConfig):
46
+ """Configuration class to store the configuration of a `CrossModel`.
47
+ """
48
+ pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
49
+ config_name = CONFIG_NAME
50
+ weights_name = WEIGHTS_NAME
51
+ def __init__(self,
52
+ vocab_size_or_config_json_file,
53
+ hidden_size=768,
54
+ num_hidden_layers=12,
55
+ num_attention_heads=12,
56
+ intermediate_size=3072,
57
+ hidden_act="gelu",
58
+ hidden_dropout_prob=0.1,
59
+ attention_probs_dropout_prob=0.1,
60
+ max_position_embeddings=512,
61
+ type_vocab_size=2,
62
+ initializer_range=0.02):
63
+ """Constructs CrossConfig.
64
+
65
+ Args:
66
+ vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CrossModel`.
67
+ hidden_size: Size of the encoder layers and the pooler layer.
68
+ num_hidden_layers: Number of hidden layers in the Transformer encoder.
69
+ num_attention_heads: Number of attention heads for each attention layer in
70
+ the Transformer encoder.
71
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward)
72
+ layer in the Transformer encoder.
73
+ hidden_act: The non-linear activation function (function or string) in the
74
+ encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
75
+ hidden_dropout_prob: The dropout probability for all fully connected
76
+ layers in the embeddings, encoder, and pooler.
77
+ attention_probs_dropout_prob: The dropout ratio for the attention
78
+ probabilities.
79
+ max_position_embeddings: The maximum sequence length that this model might
80
+ ever be used with. Typically set this to something large just in case
81
+ (e.g., 512 or 1024 or 2048).
82
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into
83
+ `CrossModel`.
84
+ initializer_range: The stddev of the truncated_normal_initializer for
85
+ initializing all weight matrices.
86
+ """
87
+ if isinstance(vocab_size_or_config_json_file, str):
88
+ with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
89
+ json_config = json.loads(reader.read())
90
+ for key, value in json_config.items():
91
+ self.__dict__[key] = value
92
+ elif isinstance(vocab_size_or_config_json_file, int):
93
+ self.vocab_size = vocab_size_or_config_json_file
94
+ self.hidden_size = hidden_size
95
+ self.num_hidden_layers = num_hidden_layers
96
+ self.num_attention_heads = num_attention_heads
97
+ self.hidden_act = hidden_act
98
+ self.intermediate_size = intermediate_size
99
+ self.hidden_dropout_prob = hidden_dropout_prob
100
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
101
+ self.max_position_embeddings = max_position_embeddings
102
+ self.type_vocab_size = type_vocab_size
103
+ self.initializer_range = initializer_range
104
+ else:
105
+ raise ValueError("First argument must be either a vocabulary size (int)"
106
+ "or the path to a pretrained model config file (str)")
107
+
108
+
109
+ class CrossEmbeddings(nn.Module):
110
+ """Construct the embeddings from word, position and token_type embeddings.
111
+ """
112
+ def __init__(self, config):
113
+ super(CrossEmbeddings, self).__init__()
114
+
115
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
116
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
117
+
118
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
119
+ # any TensorFlow checkpoint file
120
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
121
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
122
+
123
+ def forward(self, concat_embeddings, concat_type=None):
124
+
125
+ batch_size, seq_length = concat_embeddings.size(0), concat_embeddings.size(1)
126
+ if concat_type is None:
127
+ concat_type = torch.zeros(batch_size, seq_length, dtype=torch.long, device=concat_embeddings.device)
128
+
129
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=concat_embeddings.device)
130
+ position_ids = position_ids.unsqueeze(0).expand(concat_embeddings.size(0), -1)
131
+
132
+ token_type_embeddings = self.token_type_embeddings(concat_type)
133
+ position_embeddings = self.position_embeddings(position_ids)
134
+
135
+ embeddings = concat_embeddings + position_embeddings + token_type_embeddings
136
+ embeddings = self.LayerNorm(embeddings)
137
+ embeddings = self.dropout(embeddings)
138
+ return embeddings
139
+
140
+ class CrossSelfAttention(nn.Module):
141
+ def __init__(self, config):
142
+ super(CrossSelfAttention, self).__init__()
143
+ if config.hidden_size % config.num_attention_heads != 0:
144
+ raise ValueError(
145
+ "The hidden size (%d) is not a multiple of the number of attention "
146
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads))
147
+ self.num_attention_heads = config.num_attention_heads
148
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
149
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
150
+
151
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
152
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
153
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
154
+
155
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
156
+
157
+ def transpose_for_scores(self, x):
158
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
159
+ x = x.view(*new_x_shape)
160
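+ # [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]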
+ return x.permute(0, 2, 1, 3)
161
+
162
+ def forward(self, hidden_states, attention_mask):
163
+ mixed_query_layer = self.query(hidden_states)
164
+ mixed_key_layer = self.key(hidden_states)
165
+ mixed_value_layer = self.value(hidden_states)
166
+
167
+ query_layer = self.transpose_for_scores(mixed_query_layer)
168
+ key_layer = self.transpose_for_scores(mixed_key_layer)
169
+ value_layer = self.transpose_for_scores(mixed_value_layer)
170
+
171
+ # Take the dot product between "query" and "key" to get the raw attention scores.
172
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
173
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
174
+ # Apply the attention mask (precomputed for all layers in CrossModel's forward() function)
175
+ attention_scores = attention_scores + attention_mask
176
+
177
+ # Normalize the attention scores to probabilities.
178
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
179
+
180
+ # This is actually dropping out entire tokens to attend to, which might
181
+ # seem a bit unusual, but is taken from the original Transformer paper.
182
+ attention_probs = self.dropout(attention_probs)
183
+
184
+ context_layer = torch.matmul(attention_probs, value_layer)
185
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
186
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
187
+ context_layer = context_layer.view(*new_context_layer_shape)
188
+ return context_layer
189
+
190
+
191
+ class CrossSelfOutput(nn.Module):
192
+ def __init__(self, config):
193
+ super(CrossSelfOutput, self).__init__()
194
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
195
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
196
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
197
+
198
+ def forward(self, hidden_states, input_tensor):
199
+ hidden_states = self.dense(hidden_states)
200
+ hidden_states = self.dropout(hidden_states)
201
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
202
+ return hidden_states
203
+
204
+
205
+ class CrossAttention(nn.Module):
206
+ def __init__(self, config):
207
+ super(CrossAttention, self).__init__()
208
+ self.self = CrossSelfAttention(config)
209
+ self.output = CrossSelfOutput(config)
210
+
211
+ def forward(self, input_tensor, attention_mask):
212
+ self_output = self.self(input_tensor, attention_mask)
213
+ attention_output = self.output(self_output, input_tensor)
214
+ return attention_output
215
+
216
+
217
+ class CrossIntermediate(nn.Module):
218
+ def __init__(self, config):
219
+ super(CrossIntermediate, self).__init__()
220
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
221
+ self.intermediate_act_fn = ACT2FN[config.hidden_act] \
222
+ if isinstance(config.hidden_act, str) else config.hidden_act
223
+
224
+ def forward(self, hidden_states):
225
+ hidden_states = self.dense(hidden_states)
226
+ hidden_states = self.intermediate_act_fn(hidden_states)
227
+ return hidden_states
228
+
229
+
230
+ class CrossOutput(nn.Module):
231
+ def __init__(self, config):
232
+ super(CrossOutput, self).__init__()
233
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
234
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
235
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
236
+
237
+ def forward(self, hidden_states, input_tensor):
238
+ hidden_states = self.dense(hidden_states)
239
+ hidden_states = self.dropout(hidden_states)
240
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
241
+ return hidden_states
242
+
243
+
244
+ class CrossLayer(nn.Module):
245
+ def __init__(self, config):
246
+ super(CrossLayer, self).__init__()
247
+ self.attention = CrossAttention(config)
248
+ self.intermediate = CrossIntermediate(config)
249
+ self.output = CrossOutput(config)
250
+
251
+ def forward(self, hidden_states, attention_mask):
252
+ attention_output = self.attention(hidden_states, attention_mask)
253
+ intermediate_output = self.intermediate(attention_output)
254
+ layer_output = self.output(intermediate_output, attention_output)
255
+ return layer_output
256
+
257
+
258
+ class CrossEncoder(nn.Module):
259
+ def __init__(self, config):
260
+ super(CrossEncoder, self).__init__()
261
+ layer = CrossLayer(config)
262
+ self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
263
+
264
+ def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
265
+ all_encoder_layers = []
266
+ for layer_module in self.layer:
267
+ hidden_states = layer_module(hidden_states, attention_mask)
268
+ if output_all_encoded_layers:
269
+ all_encoder_layers.append(hidden_states)
270
+ if not output_all_encoded_layers:
271
+ all_encoder_layers.append(hidden_states)
272
+ return all_encoder_layers
273
+
274
+
275
+ class CrossPooler(nn.Module):
276
+ def __init__(self, config):
277
+ super(CrossPooler, self).__init__()
278
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
279
+ self.activation = nn.Tanh()
280
+
281
+ def forward(self, hidden_states):
282
+ # We "pool" the model by simply taking the hidden state corresponding
283
+ # to the first token.
284
+ first_token_tensor = hidden_states[:, 0]
285
+ pooled_output = self.dense(first_token_tensor)
286
+ pooled_output = self.activation(pooled_output)
287
+ return pooled_output
288
+
289
+
290
+ class CrossPredictionHeadTransform(nn.Module):
291
+ def __init__(self, config):
292
+ super(CrossPredictionHeadTransform, self).__init__()
293
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
294
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
295
+ if isinstance(config.hidden_act, str) else config.hidden_act
296
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
297
+
298
+ def forward(self, hidden_states):
299
+ hidden_states = self.dense(hidden_states)
300
+ hidden_states = self.transform_act_fn(hidden_states)
301
+ hidden_states = self.LayerNorm(hidden_states)
302
+ return hidden_states
303
+
304
+
305
+ class CrossLMPredictionHead(nn.Module):
306
+ def __init__(self, config, cross_model_embedding_weights):
307
+ super(CrossLMPredictionHead, self).__init__()
308
+ self.transform = CrossPredictionHeadTransform(config)
309
+
310
+ # The output weights are the same as the input embeddings, but there is
311
+ # an output-only bias for each token.
312
+ self.decoder = nn.Linear(cross_model_embedding_weights.size(1),
313
+ cross_model_embedding_weights.size(0),
314
+ bias=False)
315
+ self.decoder.weight = cross_model_embedding_weights
316
+ self.bias = nn.Parameter(torch.zeros(cross_model_embedding_weights.size(0)))
317
+
318
+ def forward(self, hidden_states):
319
+ hidden_states = self.transform(hidden_states)
320
+ hidden_states = self.decoder(hidden_states) + self.bias
321
+ return hidden_states
322
+
323
+
324
+ class CrossOnlyMLMHead(nn.Module):
325
+ def __init__(self, config, cross_model_embedding_weights):
326
+ super(CrossOnlyMLMHead, self).__init__()
327
+ self.predictions = CrossLMPredictionHead(config, cross_model_embedding_weights)
328
+
329
+ def forward(self, sequence_output):
330
+ prediction_scores = self.predictions(sequence_output)
331
+ return prediction_scores
332
+
333
+
334
+ class CrossOnlyNSPHead(nn.Module):
335
+ def __init__(self, config):
336
+ super(CrossOnlyNSPHead, self).__init__()
337
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
338
+
339
+ def forward(self, pooled_output):
340
+ seq_relationship_score = self.seq_relationship(pooled_output)
341
+ return seq_relationship_score
342
+
343
+
344
+ class CrossPreTrainingHeads(nn.Module):
345
+ def __init__(self, config, cross_model_embedding_weights):
346
+ super(CrossPreTrainingHeads, self).__init__()
347
+ self.predictions = CrossLMPredictionHead(config, cross_model_embedding_weights)
348
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
349
+
350
+ def forward(self, sequence_output, pooled_output):
351
+ prediction_scores = self.predictions(sequence_output)
352
+ seq_relationship_score = self.seq_relationship(pooled_output)
353
+ return prediction_scores, seq_relationship_score
354
+
355
+
356
+ class CrossModel(PreTrainedModel):
357
+ def __init__(self, config):
358
+ super(CrossModel, self).__init__(config)
359
+ self.embeddings = CrossEmbeddings(config)
360
+ self.encoder = CrossEncoder(config)
361
+ self.pooler = CrossPooler(config)
362
+ self.apply(self.init_weights)
363
+
364
+ def forward(self, concat_input, concat_type=None, attention_mask=None, output_all_encoded_layers=True):
365
+
366
+ if attention_mask is None:
367
+ attention_mask = torch.ones(concat_input.size(0), concat_input.size(1), device=concat_input.device)
368
+ if concat_type is None:
369
+ concat_type = torch.zeros_like(attention_mask, dtype=torch.long)
370
+
371
+ # We create a 3D attention mask from a 2D tensor mask.
372
+ # Sizes are [batch_size, 1, 1, to_seq_length]
373
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
374
+ # this attention mask is more simple than the triangular masking of causal attention
375
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
376
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
377
+
378
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
379
+ # masked positions, this operation will create a tensor which is 0.0 for
380
+ # positions we want to attend and -10000.0 for masked positions.
381
+ # Since we are adding it to the raw scores before the softmax, this is
382
+ # effectively the same as removing these entirely.
383
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
384
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
385
+
386
+ embedding_output = self.embeddings(concat_input, concat_type)
387
+ encoded_layers = self.encoder(embedding_output,
388
+ extended_attention_mask,
389
+ output_all_encoded_layers=output_all_encoded_layers)
390
+ sequence_output = encoded_layers[-1]
391
+ pooled_output = self.pooler(sequence_output)
392
+ if not output_all_encoded_layers:
393
+ encoded_layers = encoded_layers[-1]
394
+ return encoded_layers, pooled_output
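A minimal usage sketch for the cross encoder above (hypothetical shapes and names; it assumes the `utils` package is importable and that `PreTrainedModel` in `until_module` provides the usual `init_weights`/`dtype` helpers, as in UniVL-style code):

    import torch
    from utils.module_cross import CrossConfig, CrossModel

    config = CrossConfig(vocab_size_or_config_json_file=512, num_hidden_layers=2)
    cross = CrossModel(config)

    features = torch.randn(2, 10, config.hidden_size)   # e.g. concatenated visual/text features
    mask = torch.ones(2, 10)                             # 1 = attend, 0 = padding
    layers, pooled = cross(features, attention_mask=mask)
    print(layers[-1].shape, pooled.shape)                # (2, 10, 768), (2, 768)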
utils/module_decoder.py ADDED
@@ -0,0 +1,447 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import os
23
+ import copy
24
+ import json
25
+ import math
26
+ import logging
27
+ import tarfile
28
+ import tempfile
29
+ import shutil
30
+ import numpy as np
31
+
32
+ import torch
33
+ from torch import nn
34
+ from .file_utils import cached_path
35
+ from .until_config import PretrainedConfig
36
+ from .until_module import PreTrainedModel, LayerNorm, ACT2FN
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ PRETRAINED_MODEL_ARCHIVE_MAP = {}
41
+ CONFIG_NAME = 'decoder_config.json'
42
+ WEIGHTS_NAME = 'decoder_pytorch_model.bin'
43
+
44
+
45
+ class DecoderConfig(PretrainedConfig):
46
+ """Configuration class to store the configuration of a `DecoderModel`.
47
+ """
48
+ pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
49
+ config_name = CONFIG_NAME
50
+ weights_name = WEIGHTS_NAME
51
+ def __init__(self,
52
+ vocab_size_or_config_json_file,
53
+ hidden_size=768,
54
+ num_hidden_layers=12,
55
+ num_attention_heads=12,
56
+ intermediate_size=3072,
57
+ hidden_act="gelu",
58
+ hidden_dropout_prob=0.1,
59
+ attention_probs_dropout_prob=0.1,
60
+ type_vocab_size=2,
61
+ initializer_range=0.02,
62
+ max_target_embeddings=128,
63
+ num_decoder_layers=1):
64
+ """Constructs DecoderConfig.
65
+
66
+ Args:
67
+ vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `DecoderModel`.
68
+ hidden_size: Size of the encoder layers and the pooler layer.
69
+ num_hidden_layers: Number of hidden layers in the Transformer encoder.
70
+ num_attention_heads: Number of attention heads for each attention layer in
71
+ the Transformer encoder.
72
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward)
73
+ layer in the Transformer encoder.
74
+ hidden_act: The non-linear activation function (function or string) in the
75
+ encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
76
+ hidden_dropout_prob: The dropout probability for all fully connected
77
+ layers in the embeddings, encoder, and pooler.
78
+ attention_probs_dropout_prob: The dropout ratio for the attention
79
+ probabilities.
80
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into
81
+ `DecoderModel`.
82
+ initializer_range: The stddev of the truncated_normal_initializer for
83
+ initializing all weight matrices.
84
+ max_target_embeddings: The maximum sequence length that this model might
85
+ ever be used with. Typically set this to something large just in case
86
+ (e.g., 512 or 1024 or 2048).
87
+ num_decoder_layers: Number of layers stacked in both the feature `Encoder` and the caption `Decoder`.
88
+ """
89
+ if isinstance(vocab_size_or_config_json_file, str):
90
+ with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
91
+ json_config = json.loads(reader.read())
92
+ for key, value in json_config.items():
93
+ self.__dict__[key] = value
94
+ elif isinstance(vocab_size_or_config_json_file, int):
95
+ self.vocab_size = vocab_size_or_config_json_file
96
+ self.hidden_size = hidden_size
97
+ self.num_hidden_layers = num_hidden_layers
98
+ self.num_attention_heads = num_attention_heads
99
+ self.hidden_act = hidden_act
100
+ self.intermediate_size = intermediate_size
101
+ self.hidden_dropout_prob = hidden_dropout_prob
102
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
103
+ self.type_vocab_size = type_vocab_size
104
+ self.initializer_range = initializer_range
105
+ self.max_target_embeddings = max_target_embeddings
106
+ self.num_decoder_layers = num_decoder_layers
107
+ else:
108
+ raise ValueError("First argument must be either a vocabulary size (int)"
109
+ "or the path to a pretrained model config file (str)")
110
+
111
+
112
+ class BertSelfOutput(nn.Module):
113
+ def __init__(self, config):
114
+ super(BertSelfOutput, self).__init__()
115
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
116
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
117
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
118
+
119
+ def forward(self, hidden_states, input_tensor):
120
+ hidden_states = self.dense(hidden_states)
121
+ hidden_states = self.dropout(hidden_states)
122
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
123
+ return hidden_states
124
+
125
+ class BertIntermediate(nn.Module):
126
+ def __init__(self, config):
127
+ super(BertIntermediate, self).__init__()
128
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
129
+ self.intermediate_act_fn = ACT2FN[config.hidden_act] \
130
+ if isinstance(config.hidden_act, str) else config.hidden_act
131
+
132
+ def forward(self, hidden_states):
133
+ hidden_states = self.dense(hidden_states)
134
+ hidden_states = self.intermediate_act_fn(hidden_states)
135
+ return hidden_states
136
+
137
+
138
+ class BertOutput(nn.Module):
139
+ def __init__(self, config):
140
+ super(BertOutput, self).__init__()
141
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
142
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
143
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
144
+
145
+ def forward(self, hidden_states, input_tensor):
146
+ hidden_states = self.dense(hidden_states)
147
+ hidden_states = self.dropout(hidden_states)
148
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
149
+ return hidden_states
150
+
151
+
152
+ class BertPredictionHeadTransform(nn.Module):
153
+ def __init__(self, config):
154
+ super(BertPredictionHeadTransform, self).__init__()
155
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
156
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
157
+ if isinstance(config.hidden_act, str) else config.hidden_act
158
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
159
+
160
+ def forward(self, hidden_states):
161
+ hidden_states = self.dense(hidden_states)
162
+ hidden_states = self.transform_act_fn(hidden_states)
163
+ hidden_states = self.LayerNorm(hidden_states)
164
+ return hidden_states
165
+
166
+
167
+ class BertLMPredictionHead(nn.Module):
168
+ def __init__(self, config, decoder_model_embedding_weights):
169
+ super(BertLMPredictionHead, self).__init__()
170
+ self.transform = BertPredictionHeadTransform(config)
171
+
172
+ # The output weights are the same as the input embeddings, but there is
173
+ # an output-only bias for each token.
174
+ self.decoder = nn.Linear(decoder_model_embedding_weights.size(1),
175
+ decoder_model_embedding_weights.size(0),
176
+ bias=False)
177
+ self.decoder.weight = decoder_model_embedding_weights
178
+ self.bias = nn.Parameter(torch.zeros(decoder_model_embedding_weights.size(0)))
179
+
180
+ def forward(self, hidden_states):
181
+ hidden_states = self.transform(hidden_states)
182
+ hidden_states = self.decoder(hidden_states) + self.bias
183
+ return hidden_states
184
+
185
+
186
+ class BertOnlyMLMHead(nn.Module):
187
+ def __init__(self, config, decoder_model_embedding_weights):
188
+ super(BertOnlyMLMHead, self).__init__()
189
+ self.predictions = BertLMPredictionHead(config, decoder_model_embedding_weights)
190
+
191
+ def forward(self, sequence_output):
192
+ prediction_scores = self.predictions(sequence_output)
193
+ return prediction_scores
194
+
195
+ class MultiHeadAttention(nn.Module):
196
+ ''' Multi-Head Attention module '''
197
+
198
+ def __init__(self, config):
199
+ super(MultiHeadAttention, self).__init__()
200
+
201
+ if config.hidden_size % config.num_attention_heads != 0:
202
+ raise ValueError(
203
+ "The hidden size (%d) is not a multiple of the number of attention "
204
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads))
205
+ self.num_attention_heads = config.num_attention_heads
206
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
207
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
208
+
209
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
210
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
211
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
212
+
213
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
214
+
215
+ def transpose_for_scores(self, x):
216
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
217
+ x = x.view(*new_x_shape)
218
+ return x.permute(0, 2, 1, 3)
219
+
220
+ def forward(self, q, k, v, attention_mask):
221
+ mixed_query_layer = self.query(q)
222
+ mixed_key_layer = self.key(k)
223
+ mixed_value_layer = self.value(v)
224
+
225
+ query_layer = self.transpose_for_scores(mixed_query_layer)
226
+ key_layer = self.transpose_for_scores(mixed_key_layer)
227
+ value_layer = self.transpose_for_scores(mixed_value_layer)
228
+
229
+ # Take the dot product between "query" and "key" to get the raw attention scores.
230
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
231
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
232
+ # Apply the attention mask (precomputed in DecoderModel's forward() function)
233
+ attention_scores = attention_scores + attention_mask
234
+
235
+ # Normalize the attention scores to probabilities.
236
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
237
+
238
+ # This is actually dropping out entire tokens to attend to, which might
239
+ # seem a bit unusual, but is taken from the original Transformer paper.
240
+ attention_probs = self.dropout(attention_probs)
241
+
242
+ context_layer = torch.matmul(attention_probs, value_layer)
243
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
244
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
245
+ context_layer = context_layer.view(*new_context_layer_shape)
246
+
247
+ return context_layer, attention_scores
248
+
249
+ class PositionwiseFeedForward(nn.Module):
250
+ ''' A two-feed-forward-layer module '''
251
+
252
+ def __init__(self, d_in, d_hid, dropout=0.1):
253
+ super().__init__()
254
+ self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
255
+ self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
256
+ self.layer_norm = nn.LayerNorm(d_in)
257
+ self.dropout = nn.Dropout(dropout)
258
+
259
+ def forward(self, x):
260
+ residual = x
261
+ output = x.transpose(1, 2)
262
+ output = self.w_2(ACT2FN["gelu"](self.w_1(output)))
263
+ output = output.transpose(1, 2)
264
+ output = self.dropout(output)
265
+ output = self.layer_norm(output + residual)
266
+ return output
267
+
268
+ class DecoderAttention(nn.Module):
269
+ def __init__(self, config):
270
+ super(DecoderAttention, self).__init__()
271
+ self.att = MultiHeadAttention(config)
272
+ self.output = BertSelfOutput(config)
273
+
274
+ def forward(self, q, k, v, attention_mask):
275
+ att_output, attention_probs = self.att(q, k, v, attention_mask)
276
+ attention_output = self.output(att_output, q)
277
+ return attention_output, attention_probs
278
+
279
+ class EncoderLayer(nn.Module):
280
+ def __init__(self, config):
281
+ super(EncoderLayer, self).__init__()
282
+ self.slf_attn = DecoderAttention(config)
283
+ self.intermediate = BertIntermediate(config)
284
+ self.output = BertOutput(config)
285
+
286
+ def forward(self, dec_input, slf_attn_mask=None):
287
+ slf_output, slf_att_scores = self.slf_attn(dec_input, dec_input, dec_input, slf_attn_mask)
288
+ intermediate_output = self.intermediate(slf_output)
289
+ dec_output = self.output(intermediate_output, slf_output)
290
+ return dec_output, slf_att_scores
291
+
292
+ class DecoderLayer(nn.Module):
293
+ def __init__(self, config):
294
+ super(DecoderLayer, self).__init__()
295
+ self.slf_attn = DecoderAttention(config)
296
+ self.enc_attn = DecoderAttention(config)
297
+ self.intermediate = BertIntermediate(config)
298
+ self.output = BertOutput(config)
299
+
300
+ def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None):
301
+ slf_output, _ = self.slf_attn(dec_input, dec_input, dec_input, slf_attn_mask)
302
+ dec_output, dec_att_scores = self.enc_attn(slf_output, enc_output, enc_output, dec_enc_attn_mask)
303
+ intermediate_output = self.intermediate(dec_output)
304
+ dec_output = self.output(intermediate_output, dec_output)
305
+ return dec_output, dec_att_scores
306
+
307
+ class DecoderEmbeddings(nn.Module):
308
+ """Construct the embeddings from word, position and token_type embeddings.
309
+ """
310
+ def __init__(self, config, decoder_word_embeddings_weight, decoder_position_embeddings_weight):
311
+ super(DecoderEmbeddings, self).__init__()
312
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
313
+ self.position_embeddings = nn.Embedding(config.max_target_embeddings, config.hidden_size)
314
+ self.word_embeddings.weight = decoder_word_embeddings_weight
315
+ self.position_embeddings.weight = decoder_position_embeddings_weight
316
+
317
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
318
+ # any TensorFlow checkpoint file
319
+ self.LayerNorm = LayerNorm(config.hidden_size, eps=1e-12)
320
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
321
+
322
+ def forward(self, input_ids):
323
+ seq_length = input_ids.size(1)
324
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
325
+ position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
326
+
327
+ words_embeddings = self.word_embeddings(input_ids)
328
+ position_embeddings = self.position_embeddings(position_ids)
329
+
330
+ embeddings = words_embeddings + position_embeddings
331
+ embeddings = self.LayerNorm(embeddings)
332
+ embeddings = self.dropout(embeddings)
333
+ return embeddings
334
+
335
+ class Encoder(nn.Module):
336
+ def __init__(self, config):
337
+ super(Encoder, self).__init__()
338
+ layer = EncoderLayer(config)
339
+ self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_decoder_layers)])
340
+
341
+ def forward(self, hidden_states, self_attn_mask, output_all_encoded_layers=False):
342
+ dec_att_scores = None
343
+ all_encoder_layers = []
344
+ all_dec_att_probs = []
345
+ for layer_module in self.layer:
346
+ hidden_states, dec_att_scores = layer_module(hidden_states, self_attn_mask)
347
+ if output_all_encoded_layers:
348
+ all_encoder_layers.append(hidden_states)
349
+ all_dec_att_probs.append(dec_att_scores)
350
+ if not output_all_encoded_layers:
351
+ all_encoder_layers.append(hidden_states)
352
+ all_dec_att_probs.append(dec_att_scores)
353
+ return all_encoder_layers, all_dec_att_probs
354
+
355
+ class Decoder(nn.Module):
356
+ def __init__(self, config):
357
+ super(Decoder, self).__init__()
358
+ layer = DecoderLayer(config)
359
+ self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_decoder_layers)])
360
+
361
+ def forward(self, hidden_states, encoder_outs, self_attn_mask, attention_mask, output_all_encoded_layers=False):
362
+ dec_att_scores = None
363
+ all_encoder_layers = []
364
+ all_dec_att_probs = []
365
+ for i, layer_module in enumerate(self.layer):
366
+ if isinstance(encoder_outs, list):
367
+ hidden_states, dec_att_scores = layer_module(hidden_states, encoder_outs[i], self_attn_mask, attention_mask)
368
+ else:
369
+ hidden_states, dec_att_scores = layer_module(hidden_states, encoder_outs, self_attn_mask, attention_mask)
370
+ if output_all_encoded_layers:
371
+ all_encoder_layers.append(hidden_states)
372
+ all_dec_att_probs.append(dec_att_scores)
373
+ if not output_all_encoded_layers:
374
+ all_encoder_layers.append(hidden_states)
375
+ all_dec_att_probs.append(dec_att_scores)
376
+ return all_encoder_layers, all_dec_att_probs
377
+
378
+ class DecoderClassifier(nn.Module):
379
+ def __init__(self, config, embedding_weights):
380
+ super(DecoderClassifier, self).__init__()
381
+ self.cls = BertOnlyMLMHead(config, embedding_weights)
382
+
383
+ def forward(self, hidden_states):
384
+ cls_scores = self.cls(hidden_states)
385
+ return cls_scores
386
+
387
+ class DecoderModel(PreTrainedModel):
388
+
389
+ """
390
+ Transformer decoder consisting of *config.num_decoder_layers* :class:`DecoderLayer` blocks,
391
+ preceded by a stack of :class:`EncoderLayer` blocks applied to the encoder features.
392
+
393
+ Args:
394
+ config (DecoderConfig): model configuration.
395
+ decoder_word_embeddings_weight, decoder_position_embeddings_weight: weight tensors
396
+ reused for the decoder's word and position embeddings (see :class:`DecoderEmbeddings`).
397
+ """
398
+
399
+ def __init__(self, config, decoder_word_embeddings_weight, decoder_position_embeddings_weight):
400
+ super(DecoderModel, self).__init__(config)
401
+ self.config = config
402
+ self.max_target_length = config.max_target_embeddings
403
+ self.embeddings = DecoderEmbeddings(config, decoder_word_embeddings_weight, decoder_position_embeddings_weight)
404
+ self.decoder = Decoder(config)
405
+ self.encoder = Encoder(config)
406
+ self.classifier = DecoderClassifier(config, decoder_word_embeddings_weight)
407
+ self.apply(self.init_weights)
408
+
409
+ def forward(self, input_ids, encoder_outs=None, answer_mask=None, encoder_mask=None):
410
+ """
411
+ Args:
412
+ input_ids (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing
413
+ encoder_outs (Tensor, optional): output from the encoder, used for encoder-side attention
414
+
415
+ Returns:
416
+ Tensor: vocabulary logits over the target sequence, of shape `(batch, tgt_len, vocab)`
418
+ (per-layer attention weights are computed internally but are not returned).
419
+ """
420
+
421
+ embedding_output = self.embeddings(input_ids)
422
+
423
+ extended_encoder_mask = encoder_mask.unsqueeze(1).unsqueeze(2) # b x 1 x 1 x ls
424
+ extended_encoder_mask = extended_encoder_mask.to(dtype=self.dtype) # fp16 compatibility
425
+ extended_encoder_mask = (1.0 - extended_encoder_mask) * -10000.0
426
+
427
+ extended_answer_mask = answer_mask.unsqueeze(1).unsqueeze(2)
428
+ extended_answer_mask = extended_answer_mask.to(dtype=self.dtype) # fp16 compatibility
429
+
430
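+ # Build the additive self-attention mask: a position is blocked if it is padding in
+ # answer_mask or lies in the future relative to the query position (causal masking).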
+ sz_b, len_s, _ = embedding_output.size()
431
+ subsequent_mask = torch.triu(torch.ones((len_s, len_s), device=embedding_output.device, dtype=embedding_output.dtype), diagonal=1)
432
+ self_attn_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1).unsqueeze(1) # b x 1 x ls x ls
433
+ slf_attn_mask = ((1.0 - extended_answer_mask) + self_attn_mask).gt(0).to(dtype=self.dtype)
434
+ self_attn_mask = slf_attn_mask * -10000.0
435
+
436
+ encoder_outs, _ = self.encoder(encoder_outs, extended_encoder_mask, output_all_encoded_layers=True)
437
+ # encoder_outs = encoder_outs[-1]
438
+
439
+ decoded_layers, dec_att_scores = self.decoder(embedding_output,
440
+ encoder_outs,
441
+ self_attn_mask,
442
+ extended_encoder_mask,
443
+ )
444
+ sequence_output = decoded_layers[-1]
445
+ cls_scores = self.classifier(sequence_output)
446
+
447
+ return cls_scores
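A small teacher-forcing sketch for the decoder above (hypothetical sizes; as before it assumes the `until_module` helpers, and the fresh `nn.Embedding` layers below exist purely for illustration, standing in for whatever embedding weights the surrounding pipeline passes in):

    import torch
    from torch import nn
    from utils.module_decoder import DecoderConfig, DecoderModel

    config = DecoderConfig(vocab_size_or_config_json_file=49408, num_decoder_layers=1)
    word_emb = nn.Embedding(config.vocab_size, config.hidden_size)
    pos_emb = nn.Embedding(config.max_target_embeddings, config.hidden_size)
    decoder = DecoderModel(config, word_emb.weight, pos_emb.weight)

    input_ids = torch.randint(0, config.vocab_size, (2, 5))   # previously generated tokens
    encoder_outs = torch.randn(2, 7, config.hidden_size)      # fused visual/text features
    scores = decoder(input_ids, encoder_outs=encoder_outs,
                     answer_mask=torch.ones(2, 5), encoder_mask=torch.ones(2, 7))
    print(scores.shape)                                        # (2, 5, 49408)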
utils/module_gated_attention.py ADDED
@@ -0,0 +1,301 @@
1
+ r"""Gated Co-attention interface"""
2
+ from typing import Callable, List, Optional, Tuple
3
+ import math
4
+ import warnings
5
+
6
+ import torch
7
+ from torch.nn.functional import *
8
+
9
+
10
+ def gated_coattention(
11
+ query: Tensor,
12
+ key: Tensor,
13
+ value: Tensor,
14
+ embed_dim_to_check: int,
15
+ num_heads: int,
16
+ in_proj_weight: Tensor,
17
+ in_proj_bias: Tensor,
18
+ bias_k: Optional[Tensor],
19
+ bias_v: Optional[Tensor],
20
+ add_zero_attn: bool,
21
+ dropout_p: float,
22
+ out_proj_weight: Tensor,
23
+ out_proj_bias: Tensor,
24
+ training: bool = True,
25
+ gt_attention_map: Optional[Tensor] = None,
26
+ key_padding_mask: Optional[Tensor] = None,
27
+ need_weights: bool = True,
28
+ attn_mask: Optional[Tensor] = None,
29
+ use_separate_proj_weight: bool = False,
30
+ q_proj_weight: Optional[Tensor] = None,
31
+ k_proj_weight: Optional[Tensor] = None,
32
+ v_proj_weight: Optional[Tensor] = None,
33
+ static_k: Optional[Tensor] = None,
34
+ static_v: Optional[Tensor] = None,
35
+ ) -> Tuple[Tensor, Optional[Tensor]]:
36
+ r"""
37
+ Args:
38
+ query, key, value: map a query and a set of key-value pairs to an output.
39
+ See "Attention Is All You Need" for more details.
40
+ embed_dim_to_check: total dimension of the model.
41
+ num_heads: parallel attention heads.
42
+ in_proj_weight, in_proj_bias: input projection weight and bias.
43
+ bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
44
+ add_zero_attn: add a new batch of zeros to the key and
45
+ value sequences at dim=1.
46
+ dropout_p: probability of an element to be zeroed.
47
+ out_proj_weight, out_proj_bias: the output projection weight and bias.
48
+ training: apply dropout if is ``True``.
49
+ key_padding_mask: if provided, specified padding elements in the key will
50
+ be ignored by the attention. This is a binary mask. When the value is True,
51
+ the corresponding value on the attention layer will be filled with -inf.
52
+ need_weights: output attn_output_weights.
53
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask is broadcast across all
54
+ batches, while a 3D mask allows a different mask to be specified for each entry in the batch.
55
+ use_separate_proj_weight: the function accepts the projection weights for query, key,
56
+ and value in different forms. If false, in_proj_weight will be used, which is
57
+ a combination of q_proj_weight, k_proj_weight, v_proj_weight.
58
+ q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
59
+ static_k, static_v: static key and value used for attention operators.
60
+
61
+
62
+ Shape:
63
+ Inputs:
64
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
65
+ the embedding dimension.
66
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
67
+ the embedding dimension.
68
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
69
+ the embedding dimension.
70
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
71
+ If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
72
+ will be unchanged. If a BoolTensor is provided, the positions with the
73
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
74
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
75
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
76
+ S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
77
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
78
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
79
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
80
+ is provided, it will be added to the attention weight.
81
+ - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
82
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
83
+ - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
84
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
85
+
86
+ Outputs:
87
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
88
+ E is the embedding dimension.
89
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
90
+ L is the target sequence length, S is the source sequence length.
91
+ """
92
+ tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
93
+ if has_torch_function(tens_ops):
94
+ return handle_torch_function(
95
+ multi_head_attention_forward,
96
+ tens_ops,
97
+ query,
98
+ key,
99
+ value,
100
+ embed_dim_to_check,
101
+ num_heads,
102
+ in_proj_weight,
103
+ in_proj_bias,
104
+ bias_k,
105
+ bias_v,
106
+ add_zero_attn,
107
+ dropout_p,
108
+ out_proj_weight,
109
+ out_proj_bias,
110
+ training=training,
111
+ gt_attention_map=gt_attention_map,
112
+ key_padding_mask=key_padding_mask,
113
+ need_weights=need_weights,
114
+ attn_mask=attn_mask,
115
+ use_separate_proj_weight=use_separate_proj_weight,
116
+ q_proj_weight=q_proj_weight,
117
+ k_proj_weight=k_proj_weight,
118
+ v_proj_weight=v_proj_weight,
119
+ static_k=static_k,
120
+ static_v=static_v,
121
+ )
122
+ tgt_len, bsz, embed_dim = query.size()
123
+ assert embed_dim == embed_dim_to_check
124
+ # allow MHA to have different sizes for the feature dimension
125
+ assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
126
+
127
+ head_dim = embed_dim // num_heads
128
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
129
+ scaling = float(head_dim) ** -0.5
130
+
131
+ if not use_separate_proj_weight:
132
+ # encoder-decoder attention ---->>>>>>>> co-attention style
133
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
134
+ _b = in_proj_bias
135
+ _start = 0
136
+ _end = embed_dim
137
+ _w = in_proj_weight[_start:_end, :]
138
+ if _b is not None:
139
+ _b = _b[_start:_end]
140
+ q = linear(query, _w, _b)
141
+
142
+ if key is None:
143
+ assert value is None
144
+ k = None
145
+ v = None
146
+ else:
147
+
148
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
149
+ _b = in_proj_bias
150
+ _start = embed_dim
151
+ _end = None
152
+ _w = in_proj_weight[_start:, :]
153
+ if _b is not None:
154
+ _b = _b[_start:]
155
+ k, v = linear(key, _w, _b).chunk(2, dim=-1)
156
+
157
+ else:
158
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
159
+ len1, len2 = q_proj_weight_non_opt.size()
160
+ assert len1 == embed_dim and len2 == query.size(-1)
161
+
162
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
163
+ len1, len2 = k_proj_weight_non_opt.size()
164
+ assert len1 == embed_dim and len2 == key.size(-1)
165
+
166
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
167
+ len1, len2 = v_proj_weight_non_opt.size()
168
+ assert len1 == embed_dim and len2 == value.size(-1)
169
+
170
+ if in_proj_bias is not None:
171
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
172
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim : (embed_dim * 2)])
173
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2) :])
174
+ else:
175
+ q = linear(query, q_proj_weight_non_opt, in_proj_bias)
176
+ k = linear(key, k_proj_weight_non_opt, in_proj_bias)
177
+ v = linear(value, v_proj_weight_non_opt, in_proj_bias)
178
+ q = q * scaling
179
+
180
+ if attn_mask is not None:
181
+ assert (
182
+ attn_mask.dtype == torch.float32
183
+ or attn_mask.dtype == torch.float64
184
+ or attn_mask.dtype == torch.float16
185
+ or attn_mask.dtype == torch.uint8
186
+ or attn_mask.dtype == torch.bool
187
+ ), "Only float, byte, and bool types are supported for attn_mask, not {}".format(attn_mask.dtype)
188
+ if attn_mask.dtype == torch.uint8:
189
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
190
+ attn_mask = attn_mask.to(torch.bool)
191
+
192
+ if attn_mask.dim() == 2:
193
+ attn_mask = attn_mask.unsqueeze(0)
194
+ if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
195
+ raise RuntimeError("The size of the 2D attn_mask is not correct.")
196
+ elif attn_mask.dim() == 3:
197
+ if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
198
+ raise RuntimeError("The size of the 3D attn_mask is not correct.")
199
+ else:
200
+ raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
201
+ # attn_mask's dim is 3 now.
202
+
203
+ # convert ByteTensor key_padding_mask to bool
204
+ if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
205
+ warnings.warn(
206
+ "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead."
207
+ )
208
+ key_padding_mask = key_padding_mask.to(torch.bool)
209
+
210
+ if bias_k is not None and bias_v is not None:
211
+ if static_k is None and static_v is None:
212
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
213
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
214
+ if attn_mask is not None:
215
+ attn_mask = pad(attn_mask, (0, 1))
216
+ if key_padding_mask is not None:
217
+ key_padding_mask = pad(key_padding_mask, (0, 1))
218
+ else:
219
+ assert static_k is None, "bias cannot be added to static key."
220
+ assert static_v is None, "bias cannot be added to static value."
221
+ else:
222
+ assert bias_k is None
223
+ assert bias_v is None
224
+
225
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
226
+ if k is not None:
227
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
228
+ if v is not None:
229
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
230
+
231
+ if static_k is not None:
232
+ assert static_k.size(0) == bsz * num_heads
233
+ assert static_k.size(2) == head_dim
234
+ k = static_k
235
+
236
+ if static_v is not None:
237
+ assert static_v.size(0) == bsz * num_heads
238
+ assert static_v.size(2) == head_dim
239
+ v = static_v
240
+
241
+ src_len = k.size(1)
242
+
243
+ if key_padding_mask is not None:
244
+ assert key_padding_mask.size(0) == bsz
245
+ assert key_padding_mask.size(1) == src_len
246
+
247
+ if add_zero_attn:
248
+ src_len += 1
249
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
250
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
251
+ if attn_mask is not None:
252
+ attn_mask = pad(attn_mask, (0, 1))
253
+ if key_padding_mask is not None:
254
+ key_padding_mask = pad(key_padding_mask, (0, 1))
255
+
256
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
257
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
258
+
259
+ if attn_mask is not None:
260
+ if attn_mask.dtype == torch.bool:
261
+ attn_output_weights.masked_fill_(attn_mask, float("-inf"))
262
+ else:
263
+ attn_output_weights += attn_mask
264
+
265
+ if key_padding_mask is not None:
266
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
267
+ attn_output_weights = attn_output_weights.masked_fill(
268
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
269
+ float("-inf"),
270
+ )
271
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
272
+
273
+ attn_output_weights = softmax(attn_output_weights, dim=-1)
274
+
275
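+ # Intervention: replace the attention distribution of the first query position with the supplied ground-truth map.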
+ attn_output_weights_new = attn_output_weights.clone()
276
+ attn_output_weights_new[:, 0, :] = gt_attention_map
277
+ attn_output_weights = attn_output_weights_new
278
+
279
+ ##################### TAB Forget Gate (start) #########################
280
+ # temp_weights = torch.ones_like(attn_output_weights[:, 0, 0], device=attn_output_weights.device)
281
+ # temp_weights = temp_weights - attn_output_weights[:, 0, 0]
282
+ # weights = torch.mul(attn_output_weights[:, 0, :], temp_weights[:, None])
283
+ # attn_output_weights_new = attn_output_weights.clone() # Create a copy
284
+ # attn_output_weights_new[:, 0, :] = weights # Modify the copy
285
+ ##################### TAB Forget Gate (end) #########################
286
+
287
+ attn_output_weights_new = dropout(attn_output_weights_new, p=dropout_p, training=training)
288
+ attn_output = torch.bmm(attn_output_weights_new, v)
289
+
290
+
291
+
292
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
293
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
294
+ attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
295
+
296
+ if need_weights:
297
+ # average attention weights over heads
298
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
299
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
300
+ else:
301
+ return attn_output, None
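The only functional change relative to the stock PyTorch multi_head_attention_forward is the row override applied after the softmax, where gt_attention_map replaces the attention distribution of the first query position. A minimal, self-contained sketch of just that step (tensor sizes are illustrative, not taken from this commit):

import torch

# hypothetical sizes: batch of 2, 8 heads, 50 query tokens, 197 key/value tokens
bsz, num_heads, tgt_len, src_len = 2, 8, 50, 197
attn = torch.softmax(torch.randn(bsz * num_heads, tgt_len, src_len), dim=-1)
gt_attention_map = torch.softmax(torch.randn(bsz * num_heads, src_len), dim=-1)

attn_new = attn.clone()                  # keep the original weights intact
attn_new[:, 0, :] = gt_attention_map     # override the first (CLS) row, as done above
assert torch.allclose(attn_new[:, 0, :], gt_attention_map)

In the function above, the overridden weights then pass through dropout and torch.bmm with the value projection exactly as in the unmodified implementation.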
utils/tokenization_clip.py ADDED
@@ -0,0 +1,149 @@
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ The tables also avoid mapping to whitespace/control characters that the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe()):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74
+ self.encoder = dict(zip(vocab, range(len(vocab))))
75
+ self.decoder = {v: k for k, v in self.encoder.items()}
76
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
77
+ self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78
+ self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79
+
80
+ self.vocab = self.encoder
81
+
82
+ def bpe(self, token):
83
+ if token in self.cache:
84
+ return self.cache[token]
85
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
86
+ pairs = get_pairs(word)
87
+
88
+ if not pairs:
89
+ return token+'</w>'
90
+
91
+ while True:
92
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
93
+ if bigram not in self.bpe_ranks:
94
+ break
95
+ first, second = bigram
96
+ new_word = []
97
+ i = 0
98
+ while i < len(word):
99
+ try:
100
+ j = word.index(first, i)
101
+ new_word.extend(word[i:j])
102
+ i = j
103
+ except ValueError:  # 'first' does not occur in the rest of the word
104
+ new_word.extend(word[i:])
105
+ break
106
+
107
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
108
+ new_word.append(first+second)
109
+ i += 2
110
+ else:
111
+ new_word.append(word[i])
112
+ i += 1
113
+ new_word = tuple(new_word)
114
+ word = new_word
115
+ if len(word) == 1:
116
+ break
117
+ else:
118
+ pairs = get_pairs(word)
119
+ word = ' '.join(word)
120
+ self.cache[token] = word
121
+ return word
122
+
123
+ def encode(self, text):
124
+ bpe_tokens = []
125
+ text = whitespace_clean(basic_clean(text)).lower()
126
+ for token in re.findall(self.pat, text):
127
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
128
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
129
+ return bpe_tokens
130
+
131
+ def decode(self, tokens):
132
+ text = ''.join([self.decoder[token] for token in tokens])
133
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
134
+ return text
135
+
136
+ def tokenize(self, text):
137
+ tokens = []
138
+ text = whitespace_clean(basic_clean(text)).lower()
139
+ for token in re.findall(self.pat, text):
140
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
141
+ tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
142
+ return tokens
143
+
144
+ def convert_tokens_to_ids(self, tokens):
145
+ return [self.encoder[bpe_token] for bpe_token in tokens]
146
+
147
+ def convert_ids_to_tokens(self, ids):
148
+ """Converts a sequence of ids in tokens using the vocab."""
149
+ return self.decode(ids)
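For reference, a short usage sketch of the tokenizer defined above. It assumes bpe_simple_vocab_16e6.txt.gz sits next to this module (the default_bpe() path) and that ftfy and regex are installed, as the imports require; the example string is arbitrary.

from utils.tokenization_clip import SimpleTokenizer

tokenizer = SimpleTokenizer()
ids = tokenizer.encode("a photo of a cat")         # list of BPE ids
tokens = tokenizer.tokenize("a photo of a cat")    # list of BPE strings such as 'photo</w>'
assert tokenizer.convert_tokens_to_ids(tokens) == ids
print(tokenizer.decode(ids))                       # 'a photo of a cat ' ('</w>' becomes a space)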
utils/until_config.py ADDED
@@ -0,0 +1,126 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model."""
17
+
18
+ from __future__ import absolute_import
19
+ from __future__ import division
20
+ from __future__ import print_function
21
+
22
+ import os
23
+ import copy
24
+ import json
25
+ import logging
26
+ import tarfile
27
+ import tempfile
28
+ import shutil
29
+ import torch
30
+ from .file_utils import cached_path
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ class PretrainedConfig(object):
35
+
36
+ pretrained_model_archive_map = {}
37
+ config_name = ""
38
+ weights_name = ""
39
+
40
+ @classmethod
41
+ def get_config(cls, pretrained_model_name, cache_dir, type_vocab_size, state_dict, task_config=None):
42
+ archive_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), pretrained_model_name)
43
+ if os.path.exists(archive_file) is False:
44
+ if pretrained_model_name in cls.pretrained_model_archive_map:
45
+ archive_file = cls.pretrained_model_archive_map[pretrained_model_name]
46
+ else:
47
+ archive_file = pretrained_model_name
48
+
49
+ # redirect to the cache, if necessary
50
+ try:
51
+ resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
52
+ except FileNotFoundError:
53
+ if task_config is None or task_config.local_rank == 0:
54
+ logger.error(
55
+ "Model name '{}' was not found in model name list. "
56
+ "We assumed '{}' was a path or url but couldn't find any file "
57
+ "associated to this path or url.".format(
58
+ pretrained_model_name,
59
+ archive_file))
60
+ return None
61
+ if resolved_archive_file == archive_file:
62
+ if task_config is None or task_config.local_rank == 0:
63
+ logger.info("loading archive file {}".format(archive_file))
64
+ else:
65
+ if task_config is None or task_config.local_rank == 0:
66
+ logger.info("loading archive file {} from cache at {}".format(
67
+ archive_file, resolved_archive_file))
68
+ tempdir = None
69
+ if os.path.isdir(resolved_archive_file):
70
+ serialization_dir = resolved_archive_file
71
+ else:
72
+ # Extract archive to temp dir
73
+ tempdir = tempfile.mkdtemp()
74
+ if task_config is None or task_config.local_rank == 0:
75
+ logger.info("extracting archive file {} to temp dir {}".format(
76
+ resolved_archive_file, tempdir))
77
+ with tarfile.open(resolved_archive_file, 'r:gz') as archive:
78
+ archive.extractall(tempdir)
79
+ serialization_dir = tempdir
80
+ # Load config
81
+ config_file = os.path.join(serialization_dir, cls.config_name)
82
+ config = cls.from_json_file(config_file)
83
+ config.type_vocab_size = type_vocab_size
84
+ if task_config is None or task_config.local_rank == 0:
85
+ logger.info("Model config {}".format(config))
86
+
87
+ if state_dict is None:
88
+ weights_path = os.path.join(serialization_dir, cls.weights_name)
89
+ if os.path.exists(weights_path):
90
+ state_dict = torch.load(weights_path, map_location='cpu')
91
+ else:
92
+ if task_config is None or task_config.local_rank == 0:
93
+ logger.info("Weight doesn't exsits. {}".format(weights_path))
94
+
95
+ if tempdir:
96
+ # Clean up temp dir
97
+ shutil.rmtree(tempdir)
98
+
99
+ return config, state_dict
100
+
101
+ @classmethod
102
+ def from_dict(cls, json_object):
103
+ """Constructs a `BertConfig` from a Python dictionary of parameters."""
104
+ config = cls(vocab_size_or_config_json_file=-1)
105
+ for key, value in json_object.items():
106
+ config.__dict__[key] = value
107
+ return config
108
+
109
+ @classmethod
110
+ def from_json_file(cls, json_file):
111
+ """Constructs a `BertConfig` from a json file of parameters."""
112
+ with open(json_file, "r", encoding='utf-8') as reader:
113
+ text = reader.read()
114
+ return cls.from_dict(json.loads(text))
115
+
116
+ def __repr__(self):
117
+ return str(self.to_json_string())
118
+
119
+ def to_dict(self):
120
+ """Serializes this instance to a Python dictionary."""
121
+ output = copy.deepcopy(self.__dict__)
122
+ return output
123
+
124
+ def to_json_string(self):
125
+ """Serializes this instance to a JSON string."""
126
+ return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
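PretrainedConfig itself defines no fields; concrete subclasses supply config_name, weights_name and an __init__ that accepts vocab_size_or_config_json_file, which from_dict assumes. A hedged sketch with a made-up DemoConfig subclass (the real subclass used elsewhere in this Space is not shown in this file):

from utils.until_config import PretrainedConfig

class DemoConfig(PretrainedConfig):
    config_name = "demo_config.json"      # hypothetical file names
    weights_name = "pytorch_model.bin"

    def __init__(self, vocab_size_or_config_json_file=-1, hidden_size=512):
        self.vocab_size = vocab_size_or_config_json_file
        self.hidden_size = hidden_size

config = DemoConfig.from_dict({"vocab_size": 49408, "hidden_size": 512})
print(config.to_json_string())            # pretty-printed JSON of the fields set above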
utils/until_module.py ADDED
@@ -0,0 +1,278 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model."""
17
+
18
+ import logging
19
+ import numpy as np
20
+ import torch
21
+ from torch import nn
22
+ import torch.nn.functional as F
23
+ import math
24
+ from .until_config import PretrainedConfig
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ def gelu(x):
29
+ """Implementation of the gelu activation function.
30
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
31
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
32
+ """
33
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
34
+
35
+ def swish(x):
36
+ return x * torch.sigmoid(x)
37
+
38
+ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
39
+
40
+ class LayerNorm(nn.Module):
41
+ def __init__(self, hidden_size, eps=1e-12):
42
+ """Construct a layernorm module in the TF style (epsilon inside the square root).
43
+ """
44
+ super(LayerNorm, self).__init__()
45
+ self.weight = nn.Parameter(torch.ones(hidden_size))
46
+ self.bias = nn.Parameter(torch.zeros(hidden_size))
47
+ self.variance_epsilon = eps
48
+
49
+ def forward(self, x):
50
+ u = x.mean(-1, keepdim=True)
51
+ s = (x - u).pow(2).mean(-1, keepdim=True)
52
+ x = (x - u) / torch.sqrt(s + self.variance_epsilon)
53
+ return self.weight * x + self.bias
54
+
55
+ class PreTrainedModel(nn.Module):
56
+ """ An abstract class to handle weights initialization and
57
+ a simple interface for downloading and loading pretrained models.
58
+ """
59
+ def __init__(self, config, *inputs, **kwargs):
60
+ super(PreTrainedModel, self).__init__()
61
+ if not isinstance(config, PretrainedConfig):
62
+ raise ValueError(
63
+ "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
64
+ "To create a model from a Google pretrained model use "
65
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
66
+ self.__class__.__name__, self.__class__.__name__
67
+ ))
68
+ self.config = config
69
+
70
+ def init_weights(self, module):
71
+ """ Initialize the weights.
72
+ """
73
+ if isinstance(module, (nn.Linear, nn.Embedding)):
74
+ # Slightly different from the TF version which uses truncated_normal for initialization
75
+ # cf https://github.com/pytorch/pytorch/pull/5617
76
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
77
+ elif isinstance(module, LayerNorm):
78
+ if 'beta' in dir(module) and 'gamma' in dir(module):
79
+ module.beta.data.zero_()
80
+ module.gamma.data.fill_(1.0)
81
+ else:
82
+ module.bias.data.zero_()
83
+ module.weight.data.fill_(1.0)
84
+ if isinstance(module, nn.Linear) and module.bias is not None:
85
+ module.bias.data.zero_()
86
+
87
+ def resize_token_embeddings(self, new_num_tokens=None):
88
+ raise NotImplementedError
89
+
90
+ @classmethod
91
+ def init_preweight(cls, model, state_dict, prefix=None):
92
+ old_keys = []
93
+ new_keys = []
94
+ for key in state_dict.keys():
95
+ new_key = None
96
+ if 'gamma' in key:
97
+ new_key = key.replace('gamma', 'weight')
98
+ if 'beta' in key:
99
+ new_key = key.replace('beta', 'bias')
100
+ if new_key:
101
+ old_keys.append(key)
102
+ new_keys.append(new_key)
103
+ for old_key, new_key in zip(old_keys, new_keys):
104
+ state_dict[new_key] = state_dict.pop(old_key)
105
+
106
+ if prefix is not None:
107
+ old_keys = []
108
+ new_keys = []
109
+ for key in state_dict.keys():
110
+ old_keys.append(key)
111
+ new_keys.append(prefix + key)
112
+ for old_key, new_key in zip(old_keys, new_keys):
113
+ state_dict[new_key] = state_dict.pop(old_key)
114
+
115
+ missing_keys = []
116
+ unexpected_keys = []
117
+ error_msgs = []
118
+ # copy state_dict so _load_from_state_dict can modify it
119
+ metadata = getattr(state_dict, '_metadata', None)
120
+ state_dict = state_dict.copy()
121
+ if metadata is not None:
122
+ state_dict._metadata = metadata
123
+
124
+ def load(module, prefix=''):
125
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
126
+ module._load_from_state_dict(
127
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
128
+ for name, child in module._modules.items():
129
+ if child is not None:
130
+ load(child, prefix + name + '.')
131
+
132
+ load(model, prefix='')
133
+ return model
134
+
135
+ @property
136
+ def dtype(self):
137
+ """
138
+ :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
139
+ """
140
+ try:
141
+ return next(self.parameters()).dtype
142
+ except StopIteration:
143
+ # For nn.DataParallel compatibility in PyTorch 1.5
144
+ def find_tensor_attributes(module: nn.Module):
145
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
146
+ return tuples
147
+
148
+ gen = self._named_members(get_members_fn=find_tensor_attributes)
149
+ first_tuple = next(gen)
150
+ return first_tuple[1].dtype
151
+
152
+ @classmethod
153
+ def from_pretrained(cls, config, state_dict=None, *inputs, **kwargs):
154
+ """
155
+ Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
156
+ Download and cache the pre-trained model file if needed.
157
+ """
158
+ # Instantiate model.
159
+ model = cls(config, *inputs, **kwargs)
160
+ if state_dict is None:
161
+ return model
162
+ model = cls.init_preweight(model, state_dict)
163
+
164
+ return model
165
+
166
+ ##################################
167
+ ###### LOSS FUNCTION #############
168
+ ##################################
169
+
170
+
171
+ import itertools
172
+
173
+ class GroupCEn(nn.Module):
174
+ def __init__(self,):
175
+ super(GroupCEn, self).__init__()
176
+
177
+ def forward(self, sim_matrix, target):
178
+ mask = torch.eye(sim_matrix.size()[0], device=sim_matrix.device)
179
+ indices = torch.nonzero(target)
180
+ com = list(itertools.product(indices, indices))
181
+ for x, y in com:
182
+ mask[x, y] = 1.0
183
+ logpt = F.log_softmax(sim_matrix, dim=-1)
184
+ logpt = logpt * mask
185
+ logpt = torch.sum(logpt, dim=-1) / torch.sum(mask, dim=-1)
186
+ logpt = torch.diag(logpt)
187
+ nce_loss = -logpt
188
+ sim_loss = nce_loss.mean()
189
+ return sim_loss
190
+
191
+ class CrossEn(nn.Module):
192
+ def __init__(self,):
193
+ super(CrossEn, self).__init__()
194
+
195
+ def forward(self, sim_matrix):
196
+ logpt = F.log_softmax(sim_matrix, dim=-1)
197
+ logpt = torch.diag(logpt)
198
+ nce_loss = -logpt
199
+ sim_loss = nce_loss.mean()
200
+ return sim_loss
201
+
202
+ class MILNCELoss(nn.Module):
203
+ def __init__(self, batch_size=1, n_pair=1,):
204
+ super(MILNCELoss, self).__init__()
205
+ self.batch_size = batch_size
206
+ self.n_pair = n_pair
207
+ # Parse (major, minor) as integers so versions like "1.10" compare correctly.
+ torch_v = tuple(int(v) for v in torch.__version__.split(".")[:2])
208
+ self.bool_dtype = torch.bool if torch_v >= (1, 3) else torch.uint8
209
+
210
+ def forward(self, sim_matrix):
211
+ mm_mask = np.eye(self.batch_size)
212
+ mm_mask = np.kron(mm_mask, np.ones((self.n_pair, self.n_pair)))
213
+ mm_mask = torch.tensor(mm_mask).float().to(sim_matrix.device)
214
+
215
+ from_text_matrix = sim_matrix + mm_mask * -1e12
216
+ from_video_matrix = sim_matrix.transpose(1, 0)
217
+
218
+ new_sim_matrix = torch.cat([from_video_matrix, from_text_matrix], dim=-1)
219
+ logpt = F.log_softmax(new_sim_matrix, dim=-1)
220
+
221
+ mm_mask_logpt = torch.cat([mm_mask, torch.zeros_like(mm_mask)], dim=-1)
222
+ masked_logpt = logpt + (torch.ones_like(mm_mask_logpt) - mm_mask_logpt) * -1e12
223
+
224
+ new_logpt = -torch.logsumexp(masked_logpt, dim=-1)
225
+
226
+ logpt_choice = torch.zeros_like(new_logpt)
227
+ mark_ind = torch.arange(self.batch_size).to(sim_matrix.device) * self.n_pair + (self.n_pair//2)
228
+ logpt_choice[mark_ind] = 1
229
+ sim_loss = new_logpt.masked_select(logpt_choice.to(dtype=self.bool_dtype)).mean()
230
+ return sim_loss
231
+
232
+ class MaxMarginRankingLoss(nn.Module):
233
+ def __init__(self,
234
+ margin=1.0,
235
+ negative_weighting=False,
236
+ batch_size=1,
237
+ n_pair=1,
238
+ hard_negative_rate=0.5,
239
+ ):
240
+ super(MaxMarginRankingLoss, self).__init__()
241
+ self.margin = margin
242
+ self.n_pair = n_pair
243
+ self.batch_size = batch_size
244
+ easy_negative_rate = 1 - hard_negative_rate
245
+ self.easy_negative_rate = easy_negative_rate
246
+ self.negative_weighting = negative_weighting
247
+ if n_pair > 1 and batch_size > 1:
248
+ alpha = easy_negative_rate / ((batch_size - 1) * (1 - easy_negative_rate))
249
+ mm_mask = (1 - alpha) * np.eye(self.batch_size) + alpha
250
+ mm_mask = np.kron(mm_mask, np.ones((n_pair, n_pair)))
251
+ mm_mask = torch.tensor(mm_mask) * (batch_size * (1 - easy_negative_rate))
252
+ self.mm_mask = mm_mask.float()
253
+
254
+ def forward(self, x):
255
+ d = torch.diag(x)
256
+ max_margin = F.relu(self.margin + x - d.view(-1, 1)) + \
257
+ F.relu(self.margin + x - d.view(1, -1))
258
+ if self.negative_weighting and self.n_pair > 1 and self.batch_size > 1:
259
+ max_margin = max_margin * self.mm_mask.to(max_margin.device)
260
+ return max_margin.mean()
261
+
262
+ class AllGather(torch.autograd.Function):
263
+ """An autograd function that performs allgather on a tensor."""
264
+
265
+ @staticmethod
266
+ def forward(ctx, tensor, args):
267
+ output = [torch.empty_like(tensor) for _ in range(args.world_size)]
268
+ torch.distributed.all_gather(output, tensor)
269
+ ctx.rank = args.rank
270
+ ctx.batch_size = tensor.shape[0]
271
+ return torch.cat(output, dim=0)
272
+
273
+ @staticmethod
274
+ def backward(ctx, grad_output):
275
+ return (
276
+ grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)],
277
+ None,
278
+ )
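A quick sanity check (not part of the commit) for the contrastive loss defined above: CrossEn treats the diagonal of a text-video similarity matrix as the positive pairs, so a clearly separable matrix drives the loss towards zero.

import torch
from utils.until_module import CrossEn    # assumes the package layout of this Space

sim_matrix = torch.randn(4, 4)            # 4 text-video pairs with random scores
print(CrossEn()(sim_matrix).item())       # mean of -log softmax over the diagonal

ideal = 100.0 * torch.eye(4)              # positives far above all negatives
print(CrossEn()(ideal).item())            # ~0.0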