File size: 2,270 Bytes
a0e37e2
 
 
 
 
 
 
 
 
 
9433533
a0e37e2
 
 
9433533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e37e2
 
c751e97
a0e37e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9433533
a0e37e2
 
 
 
 
 
 
9433533
c751e97
9433533
a0e37e2
9433533
a0e37e2
 
 
9433533
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from dataclasses import dataclass
from enum import Enum

import torch

from ask_candid.base.lambda_base import LambdaInvokeBase


@dataclass(slots=True)
class Encoding:
    """Result of a text-encoding call: input strings paired with their embedding vectors."""

    # The input strings, as echoed back by the encoding service (see CandidSLM.encode).
    inputs: list[str]
    # Embedding tensor built from the service response with dtype float32.
    # NOTE(review): presumably one row per entry in `inputs` — confirm with the service contract.
    vectors: torch.Tensor


@dataclass(slots=True)
class SummaryItem:
    """A single extracted snippet from the document-summarization service."""

    # Rank of this snippet within the summary, as reported by the service.
    rank: int
    # Score the service assigned to this snippet.
    score: float
    # The snippet text itself (the service's "value" field).
    text: str


@dataclass(slots=True)
class TextSummary:
    """An ordered collection of summary snippets for a document."""

    # Snippets in the order they were received from the summarization service.
    snippets: list[SummaryItem]

    @property
    def summary(self) -> str:
        """Join every snippet's text with single spaces into one summary string."""
        return ' '.join(item.text for item in self.snippets)


class CandidSLM(LambdaInvokeBase):
    """Wrapper around Candid's custom small language model.
    For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
    This services includes:
        * text encoding
        * document summarization
        * entity salience estimation

    Parameters
    ----------
    access_key : Optional[str], optional
        AWS access key, by default None
    secret_key : Optional[str], optional
        AWS secret key, by default None
    """

    class Tasks(Enum):
        """URL paths selecting which task the Lambda service runs."""
        ENCODE = "/encode"
        DOCUMENT_SUMMARIZE = "/document/summarize"
        DOCUMENT_NER_SALIENCE = "/document/entitySalience"

    def __init__(
        self, access_key: str | None = None, secret_key: str | None = None
    ) -> None:
        super().__init__(
            function_name="small-lm",
            access_key=access_key,
            secret_key=secret_key
        )

    @staticmethod
    def _checked_response(response: object) -> dict:
        """Validate that the service payload is a dict, raising `TypeError` otherwise.

        The original code used `assert isinstance(response, dict)`, but `assert`
        is stripped under `python -O`, silently disabling the check — so raise
        an explicit exception instead.
        """
        if not isinstance(response, dict):
            raise TypeError(
                f"Expected a dict response from the service, got {type(response).__name__}"
            )
        return response

    def encode(self, text: list[str]) -> Encoding:
        """Encode `text` into embedding vectors via the `/encode` task.

        Parameters
        ----------
        text : list[str]
            Strings to embed.

        Returns
        -------
        Encoding
            The echoed inputs and a float32 tensor of vectors; missing or null
            response fields default to empty.

        Raises
        ------
        TypeError
            If the service response is not a dict.
        """
        response = self._checked_response(
            self._submit_request({"text": text, "path": self.Tasks.ENCODE.value})
        )
        return Encoding(
            inputs=(response.get("inputs") or []),
            vectors=torch.tensor((response.get("vectors") or []), dtype=torch.float32)
        )

    def summarize(self, text: list[str], top_k: int) -> TextSummary:
        """Summarize `text` via the `/document/summarize` task.

        Parameters
        ----------
        text : list[str]
            Document text segments to summarize.
        top_k : int
            Keep only the first `top_k` snippets from the service's summary
            list (truncation is client-side; the full summary is still fetched).

        Returns
        -------
        TextSummary
            Up to `top_k` ranked snippets; an absent/null summary yields no snippets.

        Raises
        ------
        TypeError
            If the service response is not a dict.
        """
        response = self._checked_response(
            self._submit_request({"text": text, "path": self.Tasks.DOCUMENT_SUMMARIZE.value})
        )
        return TextSummary(
            snippets=[
                SummaryItem(rank=item["rank"], score=item["score"], text=item["value"])
                for item in (response.get("summary") or [])[:top_k]
            ]
        )