
Code Reference

compute_fuzzy_matrix(strings, **kwargs)

Compute Fuzzy Matrix

Computes a matrix of pairwise fuzzy ratios (edit distances) between all strings.

The result can be thought of as a correlation matrix with all diagonal elements equal to 100.

Parameters:

Name     Type       Description                                           Default
strings  List[str]  Strings for clustering.                               required
kwargs              All optional arguments for rapidfuzz.process.cdist.   {}

Returns:

Type          Description
pd.DataFrame  Pairwise fuzzy ratios between strings.

Examples:

>>> person_names = ['Donald Trump', 'Donald Trump',
...                 'J. biden', 'joe biden', 'Biden',
...                 'Bide', 'mark esper',
...                 'Christopher c . miller',
...                 'jim mattis', 'Nancy Pelosi',
...                 'trumps', 'Trump', 'Donald',
...                 'miller']
....
Source code in fuzzup/fuzz.py
def compute_fuzzy_matrix(strings: List[str], **kwargs) -> pd.DataFrame:
    """Compute Fuzzy Matrix

    Computes matrix with pairwise fuzzy ratios (edit
    distances) between all strings.

    The result can be thought of as a correlation
    matrix with all diagonal elements equal to 100.

    Args:
        strings (List[str]): strings for clustering.
        kwargs: all optional arguments for
            rapidfuzz.process.cdist.

    Returns:
        pd.DataFrame: pairwise fuzzy ratios between
            strings.

    Examples:
        >>> person_names = ['Donald Trump', 'Donald Trump',
        ...                 'J. biden', 'joe biden', 'Biden',
        ...                 'Bide', 'mark esper',
        ...                 'Christopher c . miller',
        ...                 'jim mattis', 'Nancy Pelosi',
        ...                 'trumps', 'Trump', 'Donald',
        ...                 'miller']
        ....

    """

    # subset unique strings.
    strings = list(set(strings))

    # compute edit distances
    dists = cdist(strings, strings, **kwargs)

    dists = pd.DataFrame(dists, index=strings, columns=strings)

    return dists
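
A minimal usage sketch (assuming fuzzup is installed together with its rapidfuzz dependency; the scorer keyword is simply forwarded to rapidfuzz.process.cdist):

from rapidfuzz.fuzz import ratio

from fuzzup.fuzz import compute_fuzzy_matrix

names = ["Donald Trump", "Trump", "joe biden"]
matrix = compute_fuzzy_matrix(names, scorer=ratio)  # forward a scorer via **kwargs
print(matrix.loc["Trump", "Donald Trump"])          # pairwise fuzzy ratio, 0-100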

compute_prominence(clusters, to_dataframe=False, merge_output=True, weight_position=None, weight_multipliers=None)

Compute Prominence

Computes prominence of entity clusters.

Parameters:

Name                Type        Description                                                                                             Default
clusters            List[Dict]  Entity clusters.                                                                                        required
to_dataframe        bool        Export output as pandas dataframe?                                                                      False
merge_output        bool        Merge resulting cluster metadata with input data.                                                      True
weight_position     float       Threshold for position-adjusted weight interpolation; None means no adjustment for positions in text.   None
weight_multipliers  ndarray     Weight multipliers.                                                                                     None

Returns:

Type        Description
List[Dict]  Clusters and their prominence.

Examples:

...

Source code in fuzzup/fuzz.py
def compute_prominence(
    clusters: List[Dict],
    to_dataframe: bool = False,
    merge_output: bool = True,
    weight_position: float = None,
    weight_multipliers: np.ndarray = None,
) -> List[Dict]:
    """Compute Prominence

    Computes prominence of entity clusters.

    Args:
        clusters (List[Dict]): Entity clusters.
        to_dataframe (bool, optional): Export output
            as pandas dataframe? Defaults to False.
        merge_output (bool, optional): Merge resulting
            cluster meta data with input data. Defaults to True.
        weight_position (float, optional): threshold for position-adjusted
            weight interpolation. Defaults to None, implying
            no adjustment for positions in text.
        weight_multipliers (np.ndarray, optional): weight multipliers.
            Defaults to None.

    Returns:
        List[Dict]: clusters and their prominence.
    Examples:
        ...
    """
    # handle trivial case (empty list)
    if len(clusters) == 0:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []

    # validate inputs
    if weight_position is not None:
        assert 0 <= weight_position <= 1, "choose 'weight_position' between 0 and 1"
    if weight_multipliers is not None:
        assert len(weight_multipliers) == len(
            clusters
        ), "Multipliers must have same length as number of entities"
    else:
        weight_multipliers = float(1)

    clusters = pd.DataFrame.from_dict(clusters)

    prominence = clusters.copy()
    prominence_score = float(1)

    # adjust prominence score for word positions (=offsets)
    if weight_position is not None and len(clusters.start) > 1:
        offset_min = min(clusters.start)
        offset_max = max(clusters.start)
        # linear interpolation
        xp = [offset_min, offset_max]
        yp = [1, weight_position]
        prominence_position = np.array([np.interp(x, xp, yp) for x in clusters.start])
    else:
        prominence_position = float(1)

    prominence_score = prominence_score * prominence_position * weight_multipliers

    # aggregate prominence to group level
    prominence["prominence_score"] = prominence_score
    prominence = prominence.groupby(CLUSTER_ID)["prominence_score"].sum()

    # rank clusters by prominence
    ranks = rankdata(prominence, method="max")
    ranks = max(ranks) - ranks + 1

    # organize output as data frame
    prominence = pd.DataFrame(prominence)
    prominence["prominence_rank"] = ranks
    prominence.reset_index(level=0, inplace=True)

    if merge_output:
        prominence = pd.merge(clusters, prominence, how="left")

    if not to_dataframe:
        prominence = prominence.to_dict(orient="records")

    return prominence
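
A hedged end-to-end sketch combining simulate_ner_data() and fuzzy_cluster() from this reference (the weight_position value is an arbitrary illustration):

from fuzzup.fuzz import fuzzy_cluster, compute_prominence
from fuzzup.datasets import simulate_ner_data

ner = simulate_ner_data()       # list of dicts with 'word', 'start', ...
clusters = fuzzy_cluster(ner)   # attaches a 'cluster_id' to every entity
# entities appearing late in the text are down-weighted towards 0.5
ranked = compute_prominence(clusters, weight_position=0.5)
print(ranked[0]["prominence_score"], ranked[0]["prominence_rank"])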

compute_prominence_bygroup(clusters, return_first_rank=False, **kwargs)

Compute Prominence by Group

Computes prominence by entity group. Simply calls compute_prominence() groupwise.

Parameters:

Name               Type        Description                                        Default
clusters           List[Dict]  Entity clusters.                                   required
return_first_rank  bool        Return only entities with prominence rank 1.      False
kwargs                         All optional arguments for compute_prominence().   {}

Returns:

Type        Description
List[Dict]  Entity clusters with prominence scores.

Source code in fuzzup/fuzz.py
def compute_prominence_bygroup(
    clusters: List[Dict], return_first_rank: bool = False, **kwargs
) -> List[Dict]:
    """Compute Prominence by Group

    Computes prominence by entity group. Simply
    calls compute_prominence() groupwise.

    Args:
        clusters (List[Dict]): Entity clusters.
        kwargs: all optional arguments for
            compute_prominence().

    Returns:
        List[Dict]: entity clusters with
            prominence scores.
    """
    # handle trivial case.
    if len(clusters) == 0:
        return []

    clusters = pd.DataFrame.from_dict(clusters)
    clusters = clusters.groupby(["entity_group"])

    out = [
        compute_prominence(
            clusters=clusters.get_group(group).to_dict(orient="records"), **kwargs
        )
        for group in clusters.groups
    ]
    out = flatten(out)

    # keep only the most prominent (rank 1) entities
    if return_first_rank:
        out = [i for i in out if i["prominence_rank"] == 1]
    return out
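
The groupwise variant is used the same way; a short sketch (the clusters must carry an 'entity_group' key, as produced by simulate_ner_data()):

from fuzzup.fuzz import fuzzy_cluster, compute_prominence_bygroup
from fuzzup.datasets import simulate_ner_data

clusters = fuzzy_cluster(simulate_ner_data())
# rank prominence within each entity group; keep only rank-1 entities
top = compute_prominence_bygroup(clusters, return_first_rank=True)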

compute_prominence_placement(clusters, placement_col='placement', wgt_body=1.0, wgt_lead=2.0, wgt_title=3.0, bygroup=False, **kwargs)

Compute Prominence from Article Placement

Parameters:

Name           Type   Description                                                                      Default
clusters       list   NER predictions.                                                                 required
placement_col  str    Name of column containing article placement of entities.                        'placement'
wgt_body       float  Weight of entities in body text.                                                 1.0
wgt_lead       float  Weight of entities in lead text.                                                 2.0
wgt_title      float  Weight of entities in title.                                                     3.0
bygroup        bool   Use compute_prominence_bygroup() instead of compute_prominence()?                False
kwargs                All optional arguments for compute_prominence()/compute_prominence_bygroup().    {}

Returns:

Type  Description
list  Predictions with prominence scores.

Source code in fuzzup/fuzz.py
def compute_prominence_placement(
    clusters: list,
    placement_col: str = "placement",
    wgt_body: float = 1.0,
    wgt_lead: float = 2.0,
    wgt_title: float = 3.0,
    bygroup: bool = False,
    **kwargs,
) -> list:
    """Compute Prominence from Article Placement

    Args:
        clusters (list): NER predictions.
        placement_col (str, optional): Name of column containing article
            placement of entities. Defaults to "placement".
        wgt_body (float, optional): Weight of entities in body
            text. Defaults to 1.0.
        wgt_lead (float, optional): Weight of entities in lead
            text. Defaults to 2.0.
        wgt_title (float, optional): Weight of entities in title.
            Defaults to 3.0.
        bygroup (bool, optional): use compute_prominence_bygroup()
            instead of compute_prominence()? Defaults to False.
        kwargs: all optional arguments for compute_prominence(bygroup).

    Returns:
        list: predictions with prominence scores.

    """

    if len(clusters) == 0:
        return []

    assert all(
        [placement_col in x for x in clusters]
    ), f"key {placement_col} must be present in all dicts"

    weights = {"body": wgt_body, "lead": wgt_lead, "title": wgt_title}

    multipliers = np.array([weights.get(x.get(placement_col)) for x in clusters])

    if bygroup:
        prominence_function = compute_prominence_bygroup
    else:
        prominence_function = compute_prominence

    clusters = prominence_function(clusters, weight_multipliers=multipliers, **kwargs)

    return clusters
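
A sketch of placement-based weighting (every input dict needs the 'placement' key; simulate_ner_data() provides one):

from fuzzup.fuzz import fuzzy_cluster, compute_prominence_placement
from fuzzup.datasets import simulate_ner_data

clusters = fuzzy_cluster(simulate_ner_data())
# a title mention counts three times as much as a body mention
ranked = compute_prominence_placement(clusters, wgt_title=3.0, bygroup=True)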

fuzzy_cluster(words, cutoff=70, to_dataframe=False, merge_output=True, **kwargs)

Fuzzy Cluster

Clusters words/entities by grouping together strings whose pairwise fuzzy ratios exceed a cutoff.

Parameters:

Name          Type        Description                                                      Default
words         List[Dict]  Words/entities for clustering.                                   required
cutoff        int         Cutoff threshold value for fuzzy ratios when forming clusters.   70
to_dataframe  bool        Output as dataframe?                                             False
merge_output  bool        Merge output with original input?                                True

Returns:

Type        Description
List[Dict]  Clusters of entities.

Source code in fuzzup/fuzz.py
def fuzzy_cluster(
    words: List[Dict],
    cutoff: int = 70,
    to_dataframe: bool = False,
    merge_output: bool = True,
    **kwargs,
) -> List[Dict]:
    """_summary_

    Args:
        words (List[Dict]): Words/entities for clustering.
        cutoff (int, optional): Cutoff threshold value for fuzzy
            ratios when forming clusters. Defaults to 70.
        to_dataframe (bool, optional): Output as dataframe?
            Defaults to False.
        merge_output (bool, optional): Merge output with
            original input? Defaults to True.

    Returns:
        List[Dict]: Clusters of entities.
    """

    assert isinstance(words, list), "'words' must be a list"

    # Remove existing cluster_id entries in words
    for word in words:
        if "cluster_id" in word:
            del word["cluster_id"]

    # handle trivial case (empty list)
    if not words:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []

    if isinstance(words, list) and all([isinstance(x, dict) for x in words]):
        output_ner = True
        strings = [x.get("word") for x in words]
    else:
        output_ner = False
        strings = words

    # compute fuzzy ratios
    fuzzy_matrix = compute_fuzzy_matrix(strings, **kwargs)

    clusters = naive_cluster(fuzzy_matrix, cutoff=cutoff)

    # generate cluster ids (longest entity variation).
    cluster_ids = [max(cluster, key=len) for cluster in clusters]

    # organize output properly (for compatibility with transformers NER pipeline)
    output = []
    for idx, cluster in enumerate(clusters):
        output.append(
            pd.DataFrame.from_dict({"word": cluster, CLUSTER_ID: cluster_ids[idx]})
        )
    output = pd.concat(output, ignore_index=True)

    # merge output with original input
    if output_ner and merge_output:
        output = pd.merge(pd.DataFrame.from_dict(words), output, how="left")

    if not to_dataframe:
        output = output.to_dict(orient="records")

    return output
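
A minimal sketch; plain strings work as well as NER-style dicts:

from fuzzup.fuzz import fuzzy_cluster

words = ["Donald Trump", "Trump", "trumps", "joe biden", "Biden"]
clusters = fuzzy_cluster(words, cutoff=70)
for entry in clusters:
    # each word is assigned the id (longest variation) of its cluster
    print(entry["word"], "->", entry["cluster_id"])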

fuzzy_cluster_bygroup(words, **kwargs)

Fuzzy Cluster By Group

Fuzzy clustering by entity group. Simply calls fuzzy_cluster() groupwise.

Parameters:

Name    Type        Description                                   Default
words   List[Dict]  Words/entities.                               required
kwargs              All optional arguments for fuzzy_cluster().   {}

Returns:

Type        Description
List[Dict]  Entity clusters.

Source code in fuzzup/fuzz.py
def fuzzy_cluster_bygroup(words: List[Dict], **kwargs) -> List[Dict]:
    """Fuzzy Cluster By Group

    Fuzzy clustering by entity group. Simply
    calls fuzzy_cluster() groupwise.

    Args:
        words (List[Dict]): Words/entities.
        kwargs: all optional arguments for
            fuzzy_cluster().

    Returns:
        List[Dict]: entity clusters.
    """
    # handle trivial case.
    if len(words) == 0:
        return []

    words = pd.DataFrame.from_dict(words)
    words = words.groupby(["entity_group"])

    out = [
        fuzzy_cluster(words=words.get_group(group).to_dict(orient="records"), **kwargs)
        for group in words.groups
    ]

    out = flatten(out)

    return out
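
A short sketch (every dict needs an 'entity_group' key, so that e.g. persons and locations are never clustered together):

from fuzzup.fuzz import fuzzy_cluster_bygroup
from fuzzup.datasets import simulate_ner_data

# all simulated entities share the group "PER", so this mirrors fuzzy_cluster()
clusters = fuzzy_cluster_bygroup(simulate_ner_data(), cutoff=70)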

naive_cluster(fuzzy_matrix, cutoff=70, **kwargs)

Naive Clustering

Conducts naive clustering based on a matrix of pairwise similarities (correlations, fuzzy ratios, etc.).

Parameters:

Name          Type          Description                                                                           Default
fuzzy_matrix  pd.DataFrame  Matrix with pairwise fuzzy ratios between words.                                      required
cutoff        float         Threshold for the naive clustering algorithm with respect to pairwise fuzzy ratios.   70

Returns:

Type  Description
list  Resulting clusters.

Source code in fuzzup/fuzz.py
def naive_cluster(fuzzy_matrix: pd.DataFrame, cutoff: float = 70, **kwargs) -> list:
    """Naive Clustering

    Conducts naive clustering based on a matrix of
    pairwise similarities (correlations, fuzzy ratios, etc.).

    Args:
        fuzzy_matrix (pd.DataFrame): Matrix with
            pairwise fuzzy ratios between words.
        cutoff (float, optional): Threshold for naive
            clustering algorithm with respect to
            pairwise fuzzy ratios. Defaults to 70.

    Returns:
        list: resulting clusters.
    """
    m = fuzzy_matrix
    clusters = []
    while len(m) > 0:
        var = [m.index.tolist()[0]]
        cluster, m = helper_clustering(m, var, cutoff=cutoff)
        clusters.append(cluster)

    return clusters
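
A sketch of the two-step flow that the higher-level fuzzy_cluster() wraps:

from fuzzup.fuzz import compute_fuzzy_matrix, naive_cluster

matrix = compute_fuzzy_matrix(["Trump", "trumps", "Biden"])
# greedily grows a cluster around the first remaining row, then repeats
clusters = naive_cluster(matrix, cutoff=70)
print(clusters)  # e.g. [['Trump', 'trumps'], ['Biden']] (order may vary)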

simulate_ner_data()

Simulate NER data

Returns:

Type        Description
List[Dict]  Simulated NER data.

Source code in fuzzup/datasets.py
def simulate_ner_data() -> List[Dict]:
    """Simulate NER data

    Returns:
        List[Dict]: Simulated NER data.
    """
    PERSONS = ['Donald Trump', 'Donald Trump', 'J. biden', 'joe biden', 'Biden', 'Bide', 'mark esper', 'Christopher c . miller', 'jim mattis', 'Nancy Pelosi', 'trumps', 'Trump', 'Donald', 'miller']
    # align format with output from Hugging Face `transformers` pipeline
    n = len(PERSONS)
    PERSONS_NER = pd.DataFrame(data = PERSONS, columns=['word'])
    PERSONS_NER["entity_group"] = "PER"
    PERSONS_NER["score"] = np.random.sample(n)
    PERSONS_NER["start"] = np.random.randint(100, size=n)
    PERSONS_NER["end"] = np.random.randint(100, size=n)
    placements = ["title", "lead", "body"]
    PERSONS_NER["placement"] = random.choices(placements, k=n)
    PERSONS_NER = PERSONS_NER.to_dict(orient="records")
    return PERSONS_NER
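
For example:

from fuzzup.datasets import simulate_ner_data

record = simulate_ner_data()[0]
print(sorted(record))  # ['end', 'entity_group', 'placement', 'score', 'start', 'word']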

Cities (Whitelist)

Danish Cities

Whitelist of names of Danish cities initialized from the DAWA API.

Source code in fuzzup/whitelists.py
class Cities(Whitelist):
    """Danish Cities

    Whitelist of names of Danish cities
    initialized from the DAWA API.
    """

    def __init__(self, **kwargs):

        super().__init__(
            function_load=get_eblocal_byer, title="city", entity_group=["LOC"], **kwargs
        )

EBLocalNames (Whitelist)

EB Local Names

Whitelist with Ekstra Bladet Local Names.

Source code in fuzzup/whitelists.py
class EBLocalNames(Whitelist):
    """EB Local Names

    Whitelist with Ekstra Bladet Local Names.
    """

    def __init__(self, **kwargs):
        super().__init__(
            function_load=get_eblocal_names,
            title="eblocal_name",
            entity_group=["LOC"],
            **kwargs,
        )

Municipalities (Whitelist)

Danish Municipalities

Whitelist of names of Danish Municipalities initialized from the DAWA API.

Source code in fuzzup/whitelists.py
class Municipalities(Whitelist):
    """Danish Cities

    Whitelist of names of Danish Municipalities
    initialized from the DAWA API.
    """

    def __init__(self, **kwargs):

        super().__init__(
            function_load=get_municipalities,
            title="municipality",
            entity_group=["LOC"],
            **kwargs,
        )

Neighborhoods (Whitelist)

Danish Neighborhoods

Whitelist of names of Danish Neighborhoods initialized from the DAWA API.

Source code in fuzzup/whitelists.py
class Neighborhoods(Whitelist):
    """Danish Neighborhoods

    Whitelist of names of Danish Neighborhoods
    initialized from the DAWA API.
    """

    def __init__(self, **kwargs):
        super().__init__(
            function_load=get_eblocal_neighborhoods,
            title="neighborhood",
            entity_group=["LOC"],
            **kwargs,
        )

Whitelist

Whitelist

Whitelist objects contain a whitelist and the relevant metadata for applying it.

Attributes:

Name          Type  Description
entity_group  str   The entity group of interest for the given whitelist.
title         str   Title of the type of entity the whitelist relates to.
whitelist     dict  Whitelist with keys to match against; the values contain mappings for the given key.

Source code in fuzzup/whitelists.py
class Whitelist:
    """Whitelist

    Whitelist objects contain a whitelist and the
    relevant metadata for applying it.

    Attributes:
        entity_group (str): the entity group of interest
            for the given whitelist.
        title (str): title of the type of entity the
            whitelist relates to.
        whitelist (dict): whitelist with keys to
            match with. The values contain mappings
            for the given key.
    """

    def __init__(self, function_load, title, entity_group, **kwargs) -> None:

        self.entity_group = entity_group
        self.title = title
        logger.info(f"Loading whitelist: {title}")
        self.whitelist = function_load(**kwargs)
        logger.info("Done loading.")

    def __call__(self, words: List[Dict], **kwargs) -> List[Dict]:

        out = match_whitelist(
            words=words,
            whitelist=self.whitelist,
            entity_group=self.entity_group,
            **kwargs,
        )

        return out
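
A sketch of a custom whitelist; load_capitals and its contents are purely hypothetical, but the keys/values contract follows the attributes above:

from fuzzup.whitelists import Whitelist

def load_capitals():
    # hypothetical loader: keys are matched against entity words,
    # values carry the mappings returned for a match
    return {"Copenhagen": {"country_code": "DK"}}

capitals = Whitelist(function_load=load_capitals, title="capital", entity_group=["LOC"])
matches = capitals([{"word": "copenhagen", "entity_group": "LOC"}])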

apply_whitelists(whitelists, clusters, **kwargs)

Apply Multiple Whitelists

Parameters:

Name        Type             Description                                      Default
whitelists  List[Whitelist]  Whitelists.                                      required
clusters    List[Dict]       Results from fuzzy clustering etc.               required
kwargs                       All optional arguments for whitelist matching.   {}

Returns:

Type  Description
Dict  Output from the whitelist applications, keyed by whitelist title.

Source code in fuzzup/whitelists.py
def apply_whitelists(
    whitelists: List[Whitelist],
    clusters: List[Dict],
    **kwargs,
) -> Dict:
    """Apply Multiple Whitelists

    Args:
        whitelists (List[Whitelist]): Whitelists.
        clusters (List[Dict]): Results from fuzzy clustering etc.
        kwargs: all optional arguments for whitelist matching.

    Returns:
        Dict: output from whitelist applications.
    """
    out = {wl.title: wl(clusters, **kwargs) for wl in whitelists}
    return out
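
A sketch using two of the bundled whitelists (both fetch their data from the DAWA API, so network access is required):

from fuzzup.fuzz import fuzzy_cluster
from fuzzup.whitelists import Cities, Municipalities, apply_whitelists
from fuzzup.datasets import simulate_ner_data

clusters = fuzzy_cluster(simulate_ner_data())
results = apply_whitelists([Cities(), Municipalities()], clusters)
print(results.keys())  # dict_keys(['city', 'municipality'])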

format_output(results, columns=['neighborhood_code', 'city_code', 'municipality_code'], drop_duplicates=True)

Format Output

Formats whitelist output by extracting specific columns and converting them to a pandas DataFrame.

Parameters:

Name             Type        Description                     Default
results          List[Dict]  Results from fuzzy clustering.  required
columns          List[str]   Desired columns to extract.     ['neighborhood_code', 'city_code', 'municipality_code']
drop_duplicates  bool        Drop duplicate matches?         True

Returns:

Type          Description
pd.DataFrame  Output in the desired format.

Source code in fuzzup/whitelists.py
def format_output(
    results: List[Dict],
    columns: List[str] = ["neighborhood_code", "city_code", "municipality_code"],
    drop_duplicates: bool = True,
) -> pd.DataFrame:
    """Format Output

    Formats whitelist output by extracting
    specific columns and converting them to
    a pandas DataFrame.

    Args:
        results (List[Dict]): Results from Fuzzy Clustering.
        columns (List[str], optional): Desired columns
            to extract. Defaults to
            ['neighborhood_code', 'city_code',
            'municipality_code'].
        drop_duplicates (bool, optional): Drop duplicate
            matches? Defaults to True.

    Returns:
        pd.DataFrame: Output in desired format.
    """
    results = [format_helper(x=results.get(x), columns=columns) for x in results]
    results = pd.concat(results, ignore_index=True)
    if drop_duplicates:
        results.drop_duplicates(inplace=True, keep="first")
    return results
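
A hedged sketch; which code columns actually exist depends on the mappings in each whitelist, and 'city_code' is one of the documented defaults:

from fuzzup.fuzz import fuzzy_cluster
from fuzzup.whitelists import Cities, apply_whitelists, format_output
from fuzzup.datasets import simulate_ner_data

results = apply_whitelists([Cities()], fuzzy_cluster(simulate_ner_data()))
# reduce the match dicts to a small DataFrame of geography codes
codes = format_output(results, columns=["city_code"], drop_duplicates=True)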

get_neighborhoods()

Get all neighborhoods in DK

Source code in fuzzup/whitelists.py
def get_neighborhoods():
    """Get all neighborhoods in DK"""
    url = "https://api.dataforsyningen.dk/steder?hovedtype=Bebyggelse&undertype=bydel"
    hoods = requests.get(url).json()
    out = {hood["primærtnavn"]: {"eblocal_code": hood["id"]} for hood in hoods}
    return out

get_politicians()

Adapted from https://github.com/cfblaeb/politik.

Source code in fuzzup/whitelists.py
def get_politicians():
    """
    copy pasta from https://github.com/cfblaeb/politik
    """

    # ft.dk only allows 100 rows per call
    table = "Aktør"
    url = f"http://oda.ft.dk/api/{table}"
    totalcount = int(
        requests.get(url, params={"$inlinecount": "allpages"}).json()["odata.count"]
    )
    ccount = 0
    print(f"# records: {totalcount}")
    results = []
    while ccount < totalcount:
        r = requests.get(url, params={"$skip": ccount})
        for row in r.json()["value"]:
            if row.get("typeid") == 5:  # Type_ID 5 = Politiker i folketinget.
                if all(
                    [
                        row.get("slutdato") is None,
                        row.get("startdato") is not None,
                        row.get("fornavn") is not None,
                        row.get("efternavn") is not None,
                    ]
                ):
                    results.append(row)
            else:
                pass

        ccount += 100
        if ccount % 1000 == 0:
            print(f"# records processed: {ccount}/{totalcount}")

    print(f"Number of politicians identified: {len(results)}")

    # extract names
    names = [x.get("fornavn") + " " + x.get("efternavn") for x in results]
    names = [clean_string(x).strip() for x in names]
    names.sort()

    # convert to fuzzup dict format
    names = {x: {} for x in names}

    return names

match_whitelist(words, whitelist, score_cutoff=80, to_dataframe=False, aggregate_cluster=False, individual_wl_match=True, match_strategy=False, entity_group=None, **kwargs)

Match entities with a whitelist

Parameters:

Name               Type        Description                                                  Default
words              List[Dict]  Words/entities for matching.                                 required
whitelist          List[str]   Whitelist with words/entities to match against.              required
score_cutoff       float       Cutoff threshold value for matching.                         80
to_dataframe       bool        Return output as data frame.                                 False
aggregate_cluster  bool        Aggregate matches to cluster level.                          False
match_strategy     bool        Filter matches by prominence rank when ranks are present.    False
entity_group       List[str]   Which entity groups to match.                                None
kwargs                         Optional arguments for cdist.                                {}

Returns:

Type        Description
List[Dict]  Words and their respective matches with the whitelist.

Source code in fuzzup/whitelists.py
def match_whitelist(
    words: List[Dict],
    whitelist: List[str],
    score_cutoff: float = 80,
    to_dataframe: bool = False,
    aggregate_cluster: bool = False,
    individual_wl_match: bool = True,
    match_strategy: bool = False,
    entity_group: List[str] = None,
    **kwargs,
) -> List[Dict]:
    """Match entities with white list

    Args:
        words (List[Dict]): words/entities for matching.
        whitelist (List[str]): whitelist with words/entities
            to match against.
        score_cutoff (float, optional): Cutoff threshold value for
            matching. Defaults to 80.
        to_dataframe (bool, optional): Return output as data frame.
            Defaults to False.
        aggregate_cluster (bool, optional): Aggregate matches to
            cluster level. Defaults to False.
        match_strategy (bool, optional): Filter matches by prominence
            rank when ranks are present. Defaults to False.
        entity_group (List[str], optional): which entity groups to
            match. Defaults to None.
        kwargs: optional arguments for cdist.

    Returns:
        List[Dict]: words and their respective matches with the
            whitelist.
    """
    assert isinstance(words, list), "'words' must be a list"
    assert isinstance(whitelist, (list, dict)), "'whitelist' must be a list or dict"

    def filter_rank_df(
        df_filter: pd.DataFrame,
        df: pd.DataFrame,
        rank_limit: int = 1,
        min_count: int = 2,
    ) -> pd.DataFrame:

        # use the rank_limit/min_count parameters instead of hardcoded literals
        rank_list = df_filter.query(
            "count >= @min_count and prominence_rank == @rank_limit or prominence_rank == 1"
        ).cluster_id.tolist()
        df = df.query("cluster_id in @rank_list")
        return df

    def count_word_prominence_freq(strings: List[str]) -> pd.Series:
        """Count how often each string occurs; the counts are used
        together with the prominence ranks to decide which strings
        to return.
        """
        return pd.Series(strings).value_counts()

    def convert_version_list_to_set(row: pd.Series) -> pd.Series:
        """Convert a row's list of version numbers to a set of unique versions."""
        row["versions"] = set(row["versions"].tolist())
        return row

    # if the whitelist is a dictionary, then generate a list of keys for later use
    is_dict = False
    if isinstance(whitelist, dict):
        is_dict = True
        whitelist_dict = whitelist
        whitelist = list(whitelist.keys())
        whitelist_versions = [x.get("version") for x in whitelist_dict.values()]

    # handle trivial case (empty list)
    if not words or not whitelist:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []
    # if all words are dictionaries, it is assumed to be NER format
    if isinstance(words, list) and all([isinstance(x, dict) for x in words]):
        output_ner = True
        if entity_group is not None:
            words = [x for x in words if x.get("entity_group") in entity_group]
        strings = [x.get("word") for x in words]

    else:
        output_ner = False
        strings = words

    if len(strings) == 0:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []

    # compute fuzzy scores between the whitelist and the words
    # (operates on strings only, so no prominence information is available here)
    dists = cdist(whitelist, strings, score_cutoff=score_cutoff, **kwargs)

    # for every word, collect all whitelist entries whose score cleared the cutoff
    matches = [np.array(whitelist)[np.where(col)] for col in dists.T]
    if is_dict:
        versions = [np.array(whitelist_versions)[np.where(col)] for col in dists.T]
    if not output_ner:
        df = pd.DataFrame.from_dict({"word": strings, "matches": matches})

    if output_ner:
        df = pd.DataFrame.from_records(words)
        df["matches"] = matches
        if is_dict:
            df["versions"] = versions
            df = df.apply(convert_version_list_to_set, axis=1)

        # MATCH STRATEGY
        if "prominence_rank" in df and match_strategy is True:
            df_filter = (
                df.groupby(["cluster_id", "prominence_rank"])["word"]
                .count()
                .reset_index()
                .rename({"word": "count"}, axis=1)
            )

            # If rank 2 occurs at least twice, regardless of prominence_score
            if (df_filter[df_filter["prominence_rank"] == 2]["count"] >= 2).any():
                df = filter_rank_df(
                    df_filter=df_filter, df=df, rank_limit=2, min_count=2
                )

            # Regress to match-strategy of returning rank 1 only
            else:
                df = df[df["prominence_rank"] == 1]  # return first rank only

        if aggregate_cluster:
            matches = pd.DataFrame(
                df.groupby(by=["cluster_id"]).apply(aggregate_to_cluster),
                columns=["matches"],
                index=None,
            )
            matches = matches.reset_index()
            df.drop("matches", axis=1, inplace=True)
            df = pd.merge(df, matches, how="left")

    df["matches"] = [x.tolist() for x in df["matches"]]

    if is_dict:
        mappings = []
        for match in df.matches.tolist():
            out = [whitelist_dict.get(x) for x in match]
            mappings.append(out)
        df["mappings"] = mappings

    # subset matches only
    df = df[df["matches"].astype(str) != "[]"]

    if not to_dataframe:
        df = df.to_dict(orient="records")

    return df
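
A minimal sketch with a plain list whitelist:

from fuzzup.whitelists import match_whitelist

whitelist = ["Copenhagen", "Aarhus", "Odense"]
words = [{"word": "copenhagen", "entity_group": "LOC"}]
# only entities in the listed groups are compared against the whitelist
out = match_whitelist(words, whitelist, score_cutoff=80, entity_group=["LOC"])
print(out)  # [{'word': 'copenhagen', 'entity_group': 'LOC', 'matches': ['Copenhagen']}]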