Code Reference

`compute_fuzzy_matrix(strings, **kwargs)`

Compute Fuzzy Matrix

Computes a matrix of pairwise fuzzy ratios (edit-distance-based similarity scores) between all strings.

The result can be thought of as a correlation matrix with all diagonal elements equal to 100.

Parameters:

Name	Type	Description	Default
`strings`	`List[str]`	strings for clustering.	required
`kwargs`		all optional arguments for rapidfuzz.process.cdist.	`{}`

Returns:

Type	Description
`pd.DataFrame`	pairwise fuzzy ratios between strings.

Examples:

>>> person_names = ['Donald Trump', 'Donald Trump',
                    'J. biden', 'joe biden', 'Biden',
                    'Bide', 'mark esper',
                    'Christopher c . miller',
                    'jim mattis', 'Nancy Pelosi',
                    'trumps', 'Trump', 'Donald',
                    'miller']
....

Source code in fuzzup/fuzz.py

def compute_fuzzy_matrix(strings: List[str], **kwargs) -> pd.DataFrame:
    """Compute Fuzzy Matrix

    Computes a square matrix of pairwise fuzzy ratios (edit-distance
    based similarity scores) between all unique strings.

    The result can be thought of as a correlation
    matrix with all diagonal elements equal to 100.

    Args:
        strings (List[str]): strings for clustering. Duplicates are
            removed; the first occurrence of each string determines
            its position in the matrix.
        kwargs: all optional arguments for
            rapidfuzz.process.cdist.

    Returns:
        pd.DataFrame: pairwise fuzzy ratios between
            strings, with the unique strings as both index
            and columns.

    Examples:
        >>> person_names = ['Donald Trump', 'Donald Trump',
        ...                 'J. biden', 'joe biden', 'Biden',
        ...                 'Bide', 'mark esper',
        ...                 'Christopher c . miller',
        ...                 'jim mattis', 'Nancy Pelosi',
        ...                 'trumps', 'Trump', 'Donald',
        ...                 'miller']
        >>> fuzzy_matrix = compute_fuzzy_matrix(person_names)

    """

    # Subset unique strings while keeping first-occurrence order.
    # A plain set() would order rows/columns by string hash, which is
    # randomized per interpreter run and would make the matrix layout
    # (and anything seeded from its first row) non-reproducible.
    unique_strings = list(dict.fromkeys(strings))

    # compute pairwise fuzzy ratios
    ratios = cdist(unique_strings, unique_strings, **kwargs)

    return pd.DataFrame(ratios, index=unique_strings, columns=unique_strings)

`compute_prominence(clusters, to_dataframe=False, merge_output=True, weight_position=None, weight_multipliers=None)`

Compute Prominence

Computes prominence of entity clusters.

Parameters:

Name	Type	Description	Default
`clusters`	`List[Dict]`	Entity clusters.	required
`to_dataframe`	`bool`	Export output as pandas dataframe? Defaults to False.	`False`
`merge_output`	`bool`	Merge resulting cluster meta data with input data. Defaults to True.	`True`
`weight_position`	`float`	threshold for position-adjusted weight interpolation. Defaults to None implying no adjustment for positions in text.	`None`
`weight_multipliers`	`ndarray`	weight multipliers.	`None`

Returns:

Type	Description
`List[Dict]`	clusters and their prominence.

Examples:

...

Source code in fuzzup/fuzz.py

def compute_prominence(
    clusters: List[Dict],
    to_dataframe: bool = False,
    merge_output: bool = True,
    weight_position: float = None,
    weight_multipliers: np.ndarray = None,
) -> List[Dict]:
    """Compute Prominence

    Computes prominence of entity clusters. Every entity starts with a
    score of 1, optionally scaled by a position weight and by a
    caller-supplied multiplier. Scores are summed per cluster and the
    clusters are ranked so that the highest total gets rank 1 (tied
    clusters share the same rank).

    Args:
        clusters (List[Dict]): Entity clusters.
        to_dataframe (bool, optional): Export output
            as pandas dataframe? Defaults to False.
        merge_output (bool, optional): Merge resulting
            cluster meta data with input data. Defaults to True.
        weight_position (float, optional): threshold for position-adjusted
            weight interpolation, between 0 and 1. The weight is
            interpolated linearly from 1 at the smallest 'start' offset
            to 'weight_position' at the largest. Defaults to None implying
            no adjustment for positions in text.
        weight_multipliers (np.ndarray, optional): weight multipliers,
            one per entity in 'clusters'. Defaults to None (no scaling).

    Returns:
        List[Dict]: clusters and their prominence. A pandas DataFrame
            is returned instead when 'to_dataframe' is True.

    Raises:
        AssertionError: if 'weight_position' is outside [0, 1] or
            'weight_multipliers' does not have one value per entity.

    Examples:
        ...
    """
    # handle trivial case (empty list)
    if len(clusters) == 0:
        return pd.DataFrame() if to_dataframe else []

    # validate inputs
    if weight_position is not None:
        assert 0 <= weight_position <= 1, "choose 'weight_position' between 0 and 1"
    if weight_multipliers is not None:
        assert len(weight_multipliers) == len(
            clusters
        ), "Multipliers must have same length as number of entities"
    else:
        weight_multipliers = float(1)

    clusters = pd.DataFrame.from_dict(clusters)

    # Discard prominence columns left over from an earlier run. If they
    # stayed, the merge below would treat them as join keys and hand the
    # stale values back instead of the freshly computed ones.
    stale_columns = [
        column
        for column in ("prominence_score", "prominence_rank")
        if column in clusters.columns
    ]
    if stale_columns:
        clusters = clusters.drop(columns=stale_columns)

    prominence = clusters.copy()
    prominence_score = float(1)

    # adjust prominence score for word positions (=offsets)
    if weight_position is not None and len(clusters.start) > 1:
        offset_min = min(clusters.start)
        offset_max = max(clusters.start)
        # linear interpolation between the first and the last offset
        xp = [offset_min, offset_max]
        yp = [1, weight_position]
        prominence_position = np.array([np.interp(x, xp, yp) for x in clusters.start])
    else:
        prominence_position = float(1)

    prominence_score = prominence_score * prominence_position * weight_multipliers

    # aggregate prominence to group level
    prominence["prominence_score"] = prominence_score
    prominence = prominence.groupby(CLUSTER_ID)["prominence_score"].sum()

    # rank clusters by prominence; rankdata ranks ascending, so flip the
    # ranks to make the most prominent cluster rank 1
    ranks = rankdata(prominence, method="max")
    ranks = max(ranks) - ranks + 1

    # organize output as data frame
    prominence = pd.DataFrame(prominence)
    prominence["prominence_rank"] = ranks
    prominence.reset_index(level=0, inplace=True)

    if merge_output:
        prominence = pd.merge(clusters, prominence, how="left", on=CLUSTER_ID)

    if not to_dataframe:
        prominence = prominence.to_dict(orient="records")

    return prominence

`compute_prominence_bygroup(clusters, return_first_rank=False, **kwargs)`

Compute Prominence by Group

Computes prominence by entity group. Simply calls compute_prominence() groupwise.

Parameters:

Name	Type	Description	Default
`clusters`	`List[Dict]`	Entity clusters.	required
`kwargs`		all optional arguments for compute_prominence().	`{}`

Returns:

Type	Description
`List[Dict]`	entity clusters with prominence scores.

Source code in fuzzup/fuzz.py

def compute_prominence_bygroup(
    clusters: List[Dict], return_first_rank: bool = False, **kwargs
) -> List[Dict]:
    """Compute Prominence by Group

    Computes prominence by entity group. Simply
    calls compute_prominence() groupwise.

    Args:
        clusters (List[Dict]): Entity clusters. Every entry is
            expected to carry an "entity_group" key.
        return_first_rank (bool, optional): Only return entities
            whose cluster has prominence rank 1 within its group.
            Defaults to False.
        kwargs: all optional arguments for
            compute_prominence(). A 'weight_multipliers' argument
            must hold one value per entry of 'clusters'; it is split
            so that each group receives only its own multipliers.

    Returns:
        List[Dict]: entity clusters with
            prominence scores, ordered group by group.

    Raises:
        AssertionError: if 'weight_multipliers' does not have one
            value per entity.
    """
    # handle trivial case.
    if len(clusters) == 0:
        return []

    # Multipliers are given per entity of the full input, so they have to
    # be subset per group; forwarding the full array to every group would
    # trip the length check in compute_prominence().
    weight_multipliers = kwargs.pop("weight_multipliers", None)
    if weight_multipliers is not None:
        assert len(weight_multipliers) == len(
            clusters
        ), "Multipliers must have same length as number of entities"
        weight_multipliers = np.asarray(weight_multipliers)

    frame = pd.DataFrame.from_dict(clusters)

    out = []
    # Group by a scalar key and iterate directly instead of looking up
    # groups by key through get_group().
    for _, group_frame in frame.groupby("entity_group"):
        group_kwargs = dict(kwargs)
        if weight_multipliers is not None:
            # the frame has a default integer index, so index labels
            # are the positions of the rows in the original input
            group_kwargs["weight_multipliers"] = weight_multipliers[
                group_frame.index.to_numpy()
            ]
        out.append(
            compute_prominence(
                clusters=group_frame.to_dict(orient="records"), **group_kwargs
            )
        )
    out = flatten(out)

    # If only the most prominent entities are wanted, drop everything
    # that is not ranked first within its group.
    if return_first_rank:
        out = [i for i in out if i["prominence_rank"] == 1]
    return out

`compute_prominence_placement(clusters, placement_col='placement', wgt_body=1.0, wgt_lead=2.0, wgt_title=3.0, bygroup=False, **kwargs)`

Compute Prominence from Article Placement

Parameters:

Name	Type	Description	Default
`clusters`	`list`	NER predictions.	required
`placement_col`	`str`	Name of column containing article placement of entities. Defaults to "placement".	`'placement'`
`wgt_body`	`float`	Weight of entities in body text. Defaults to 1.0.	`1.0`
`wgt_lead`	`float`	Weight of entities in lead text. Defaults to 2.0.	`2.0`
`wgt_title`	`float`	Weight of entities in title. Defaults to 3.0.	`3.0`
`bygroup`	`bool`	Use compute_prominence_bygroup() instead of compute_prominence()? Defaults to False.	`False`
`kwargs`		all optional arguments for compute_prominence(bygroup).	`{}`

Returns:

Type	Description
`list`	predictions with prominence scores.

Source code in fuzzup/fuzz.py

def compute_prominence_placement(
    clusters: list,
    placement_col: str = "placement",
    wgt_body: float = 1.0,
    wgt_lead: float = 2.0,
    wgt_title: float = 3.0,
    bygroup: bool = False,
    **kwargs,
) -> list:
    """Compute Prominence from Article Placement

    Translates the article placement of each entity ("body", "lead" or
    "title") into a weight multiplier and computes prominence with it.

    Args:
        clusters (list): NER predictions.
        placement_col (str, optional): Name of column containing article
            placement of entities. Defaults to "placement".
        wgt_body (float, optional): Weight of entities in body
            text. Defaults to 1.0.
        wgt_lead (float, optional): Weight of entities in lead
            text. Defaults to 2.0.
        wgt_title (float, optional): Weight of entities in title.
            Defaults to 3.0.
        bygroup (bool, optional): use compute_prominence_bygroup()
            instead of compute_prominence()? Defaults to False.
        kwargs: all optional arguments for the selected
            prominence function.

    Returns:
        list: predictions with prominence scores.

    Raises:
        AssertionError: if 'placement_col' is missing from any entry.
        ValueError: if a placement value is not one of "body", "lead"
            or "title".
    """

    if len(clusters) == 0:
        return []

    assert all(
        placement_col in x for x in clusters
    ), f"key {placement_col} must be present in all dicts"

    weights = {"body": wgt_body, "lead": wgt_lead, "title": wgt_title}

    placements = [x.get(placement_col) for x in clusters]

    # An unknown placement has no weight; fail here with a clear message
    # instead of passing None multipliers on and failing later with an
    # unrelated-looking arithmetic error.
    unknown = sorted({str(p) for p in placements if p not in weights})
    if unknown:
        raise ValueError(
            f"unknown values in '{placement_col}': {unknown}; "
            f"expected one of {list(weights)}"
        )

    multipliers = np.array([weights[p] for p in placements])

    prominence_function = (
        compute_prominence_bygroup if bygroup else compute_prominence
    )

    return prominence_function(clusters, weight_multipliers=multipliers, **kwargs)

`fuzzy_cluster(words, cutoff=70, to_dataframe=False, merge_output=True, **kwargs)`

Fuzzy Cluster

Clusters words/entities based on their pairwise fuzzy ratios and a cutoff threshold.

Parameters:

Name	Type	Description	Default
`words`	`List[Dict]`	Words/entities for clustering.	required
`cutoff`	`int`	Cutoff threshold value for fuzzy ratios when forming clusters. Defaults to 70.	`70`
`to_dataframe`	`bool`	Output as dataframe? Defaults to False.	`False`
`merge_output`	`bool`	Merge output with original input? Defaults to True.	`True`

Returns:

Type	Description
`List[Dict]`	Clusters of entities.

Source code in fuzzup/fuzz.py

def fuzzy_cluster(
    words: List[Dict],
    cutoff: int = 70,
    to_dataframe: bool = False,
    merge_output: bool = True,
    **kwargs,
) -> List[Dict]:
    """Fuzzy Cluster

    Clusters words/entities based on their pairwise fuzzy ratios. Each
    cluster is identified by its longest member string.

    Args:
        words (List[Dict]): Words/entities for clustering. Either a
            list of dicts with a "word" key (NER format) or a list
            of plain strings. The input is not modified.
        cutoff (int, optional): Cutoff threshold value for fuzzy
            ratios when forming clusters. Defaults to 70.
        to_dataframe (bool, optional): Output as dataframe?
            Defaults to False.
        merge_output (bool, optional): Merge output with
            original input? Only applies to NER-format input.
            Defaults to True.
        kwargs: all optional arguments for compute_fuzzy_matrix().

    Returns:
        List[Dict]: Clusters of entities. A pandas DataFrame is
            returned instead when 'to_dataframe' is True.

    Raises:
        AssertionError: if 'words' is not a list.
    """

    assert isinstance(words, list), "'words' must be a list"

    # handle trivial case (empty list)
    if not words:
        return pd.DataFrame() if to_dataframe else []

    # if all words are dictionaries, it is assumed to be NER format
    output_ner = all(isinstance(x, dict) for x in words)
    if output_ner:
        # Work on copies without any existing cluster_id entry, so that
        # old assignments do not interfere with the merge below and the
        # caller's dicts are left untouched.
        words = [
            {key: value for key, value in word.items() if key != "cluster_id"}
            for word in words
        ]
        strings = [x.get("word") for x in words]
    else:
        strings = words

    # compute fuzzy ratios
    fuzzy_matrix = compute_fuzzy_matrix(strings, **kwargs)

    clusters = naive_cluster(fuzzy_matrix, cutoff=cutoff)

    # organize output properly (for compatibility with transformers NER
    # pipeline); the cluster id is the longest entity variation
    output = pd.concat(
        [
            pd.DataFrame.from_dict(
                {"word": cluster, CLUSTER_ID: max(cluster, key=len)}
            )
            for cluster in clusters
        ],
        ignore_index=True,
    )

    # merge output with original input
    if output_ner and merge_output:
        output = pd.merge(pd.DataFrame.from_dict(words), output, how="left")

    if not to_dataframe:
        output = output.to_dict(orient="records")

    return output

`fuzzy_cluster_bygroup(words, **kwargs)`

Fuzzy Cluster By Group

Fuzzy clustering by entity group. Simply calls fuzzy_cluster() groupwise.

Parameters:

Name	Type	Description	Default
`words`	`List[Dict]`	Words/entities.	required
`kwargs`		all optional arguments for fuzzy_cluster().	`{}`

Returns:

Type	Description
`List[Dict]`	entity clusters.

Source code in fuzzup/fuzz.py

def fuzzy_cluster_bygroup(words: List[Dict], **kwargs) -> List[Dict]:
    """Fuzzy Cluster By Group

    Fuzzy clustering by entity group. Simply
    calls fuzzy_cluster() groupwise.

    Args:
        words (List[Dict]): Words/entities. Every entry is
            expected to carry an "entity_group" key.
        kwargs: all optional arguments for
            fuzzy_cluster().

    Returns:
        List[Dict]: entity clusters, ordered group by group.
    """
    # handle trivial case.
    if len(words) == 0:
        return []

    frame = pd.DataFrame.from_dict(words)

    # Group by a scalar key and iterate directly instead of looking up
    # groups by key through get_group().
    out = [
        fuzzy_cluster(words=group_frame.to_dict(orient="records"), **kwargs)
        for _, group_frame in frame.groupby("entity_group")
    ]

    return flatten(out)

`naive_cluster(fuzzy_matrix, cutoff=70, **kwargs)`

Naive Clustering

Conducts naive clustering based on matrix with pairwise correlations, fuzzy ratios etc.

Parameters:

Name	Type	Description	Default
`fuzzy_matrix`	`pd.DataFrame`	Matrix with pairwise fuzzy ratios between words.	required
`cutoff`	`float`	Threshold for naive clustering algorithm with respect to pairwise fuzzy ratios. Defaults to 70.	`70`

Returns:

Type	Description
`list`	resulting clusters.

Source code in fuzzup/fuzz.py

def naive_cluster(fuzzy_matrix: pd.DataFrame, cutoff: float = 70, **kwargs) -> list:
    """Naive Clustering

    Builds clusters from a matrix of pairwise scores (correlations,
    fuzzy ratios etc.) by repeatedly seeding a cluster with the first
    remaining row label and letting helper_clustering() grow it.

    Args:
        fuzzy_matrix (pd.DataFrame): Matrix with
            pairwise fuzzy ratios between words.
        cutoff (float, optional): Threshold for naive
            clustering algorithm with respect to
            pairwise fuzzy ratios. Defaults to 70.
        kwargs: accepted but not used.

    Returns:
        list: resulting clusters.
    """
    remaining = fuzzy_matrix
    found = []
    while len(remaining) != 0:
        # seed the next cluster with the first label still in the matrix
        seed = remaining.index.tolist()[:1]
        # the helper returns the finished cluster and the matrix that
        # is left to process
        members, remaining = helper_clustering(remaining, seed, cutoff=cutoff)
        found.append(members)

    return found

`simulate_ner_data()`

Simulate NER data

Returns:

Type	Description
`List[Dict]`	Simulated NER data.

Source code in fuzzup/datasets.py

def simulate_ner_data() -> List[Dict]:
    """Simulate NER data

    Produces a fixed list of person names decorated with random
    scores, offsets and article placements, in the record format
    returned by the Hugging Face `transformers` NER pipeline.

    Returns:
        List[Dict]: Simulated NER data.
    """
    names = [
        'Donald Trump',
        'Donald Trump',
        'J. biden',
        'joe biden',
        'Biden',
        'Bide',
        'mark esper',
        'Christopher c . miller',
        'jim mattis',
        'Nancy Pelosi',
        'trumps',
        'Trump',
        'Donald',
        'miller',
    ]
    count = len(names)

    # Dict entries are evaluated top to bottom, so the random draws are
    # made in the order score, start, end, placement.
    frame = pd.DataFrame(
        {
            "word": names,
            "entity_group": "PER",
            "score": np.random.sample(count),
            "start": np.random.randint(100, size=count),
            "end": np.random.randint(100, size=count),
            "placement": random.choices(["title", "lead", "body"], k=count),
        }
    )
    return frame.to_dict(orient="records")

`Cities (Whitelist)`

Danish Cities

Whitelist of names of Danish cities initialized from the DAWA API.

Source code in fuzzup/whitelists.py

class Cities(Whitelist):
    """Danish Cities

    Whitelist of names of Danish cities
    initialized from the DAWA API.

    Entries are loaded with get_eblocal_byer, titled "city" and
    matched against entities in the "LOC" group.
    """

    def __init__(self, **kwargs):
        # remaining keyword arguments are forwarded by Whitelist to the
        # loader function
        loader = get_eblocal_byer
        super().__init__(loader, "city", ["LOC"], **kwargs)

`EBLocalNames (Whitelist)`

EB Local Names

Whitelist with Ekstra Bladet Local Names.

Source code in fuzzup/whitelists.py

class EBLocalNames(Whitelist):
    """EB Local Names

    Whitelist with Ekstra Bladet Local Names.

    Entries are loaded with get_eblocal_names, titled "eblocal_name"
    and matched against entities in the "LOC" group.
    """

    def __init__(self, **kwargs):
        # remaining keyword arguments are forwarded by Whitelist to the
        # loader function
        loader = get_eblocal_names
        super().__init__(loader, "eblocal_name", ["LOC"], **kwargs)

`Municipalities (Whitelist)`

Danish Municipalities

Whitelist of names of Danish Municipalities initialized from the DAWA API.

Source code in fuzzup/whitelists.py

class Municipalities(Whitelist):
    """Danish Municipalities

    Whitelist of names of Danish Municipalities
    initialized from the DAWA API.

    Entries are loaded with get_municipalities, titled "municipality"
    and matched against entities in the "LOC" group.
    """

    def __init__(self, **kwargs):
        # remaining keyword arguments are forwarded by Whitelist to the
        # loader function
        loader = get_municipalities
        super().__init__(loader, "municipality", ["LOC"], **kwargs)

`Neighborhoods (Whitelist)`

Danish Neighborhoods

Whitelist of names of Danish Neighborhoods initialized from the DAWA API.

Source code in fuzzup/whitelists.py

class Neighborhoods(Whitelist):
    """Danish Neighborhoods

    Whitelist of names of Danish Neighborhoods
    initialized from the DAWA API.

    Entries are loaded with get_eblocal_neighborhoods, titled
    "neighborhood" and matched against entities in the "LOC" group.
    """

    def __init__(self, **kwargs):
        # remaining keyword arguments are forwarded by Whitelist to the
        # loader function
        loader = get_eblocal_neighborhoods
        super().__init__(loader, "neighborhood", ["LOC"], **kwargs)

`Whitelist`

Whitelist

Whitelist objects containing whitelists and relevant meta data regarding how to apply it.

Attributes:

Name	Type	Description
`entity_group`	`List[str]`	the entity group(s) of interest for the given whitelist.
`title`	`str`	title of the type of entity the whitelist relates to.
`whitelist`	`dict`	whitelist with keys to match with. The values contain mappings for the given key.

Source code in fuzzup/whitelists.py

class Whitelist:
    """Whitelist

    Callable wrapper around a whitelist plus the meta data
    needed to apply it. Calling an instance matches words
    against the whitelist via match_whitelist().

    Attributes:
        entity_group (List[str]): the entity group(s) of interest
            for the given whitelist, e.g. ["LOC"].
        title (str): title of the type of entity the
            whitelist relates to.
        whitelist (dict): whitelist with keys to
            match with. The values contain mappings
            for the given key.
    """

    def __init__(self, function_load, title, entity_group, **kwargs) -> None:
        self.entity_group, self.title = entity_group, title

        # the loader receives all remaining keyword arguments
        logger.info(f"Loading whitelist: {title}")
        self.whitelist = function_load(**kwargs)
        logger.info("Done loading.")

    def __call__(self, words: List[Dict], **kwargs) -> List[Dict]:
        # delegate to match_whitelist, restricted to this whitelist's
        # entity group(s)
        return match_whitelist(
            words=words,
            whitelist=self.whitelist,
            entity_group=self.entity_group,
            **kwargs,
        )

`apply_whitelists(whitelists, clusters, **kwargs)`

Apply Multiple Whitelists

Parameters:

Name	Type	Description	Default
`whitelists`	`List[Whitelist]`	Whitelists.	required
`clusters`	`List[Dict]`	Results from fuzzy clustering etc.	required
`kwargs`		all optional arguments for whitelist matching.	`{}`

Returns:

Type	Description
`Dict`	output from whitelist applications.

Source code in fuzzup/whitelists.py

def apply_whitelists(
    whitelists: List[Whitelist],
    clusters: List[Dict],
    **kwargs,
) -> Dict:
    """Apply Multiple Whitelists

    Calls every whitelist on the same clusters and collects the
    results under the whitelist's title.

    Args:
        whitelists (List[Whitelist]): Whitelists.
        clusters (List[Dict]): Results from fuzzy clustering etc.
        kwargs: all optional arguments for whitelist matching.

    Returns:
        Dict: output from whitelist applications, keyed by
            whitelist title. Whitelists sharing a title overwrite
            each other; the last one wins.
    """
    return {wl.title: wl(clusters, **kwargs) for wl in whitelists}

`format_output(results, columns=['neighborhood_code', 'city_code', 'municipality_code'], drop_duplicates=True)`

Format Output

Formats output from whitelist format by extracting only specific columns and converting them to a pandas DataFrame.

Parameters:

Name	Type	Description	Default
`results`	`List[Dict]`	Results from Fuzzy Clustering.	required
`columns`	`List[str]`	Desired columns to extract. Defaults to ['neighborhood_code', 'city_code', 'municipality_code'].	`['neighborhood_code', 'city_code', 'municipality_code']`
`drop_duplicates`	`bool`	Drop duplicate matches? Defaults to True.	`True`

Returns:

Type	Description
`pd.DataFrame`	Output in desired format.

Source code in fuzzup/whitelists.py

def format_output(
    results: Dict[str, List[Dict]],
    columns: List[str] = ["neighborhood_code", "city_code", "municipality_code"],
    drop_duplicates: bool = True,
) -> pd.DataFrame:
    """Format Output

    Formats output from whitelist format by extracting
    only specific columns and converting them to
    a pandas DataFrame.

    Args:
        results (Dict[str, List[Dict]]): Results from whitelist
            matching, keyed by whitelist title (e.g. the output of
            apply_whitelists()).
        columns (List[str], optional): Desired columns
            to extract. Defaults to
            ['neighborhood_code', 'city_code',
            'municipality_code'].
        drop_duplicates (bool, optional): Drop duplicate
            matches? Defaults to True.

    Returns:
        pd.DataFrame: Output in desired format. An empty DataFrame
            is returned when 'results' is empty.
    """
    # Hand a copy of 'columns' to the helper so the shared default list
    # can never be modified through it.
    frames = [
        format_helper(x=matches, columns=list(columns))
        for matches in results.values()
    ]

    # pd.concat() refuses an empty sequence; nothing to format then.
    if not frames:
        return pd.DataFrame()

    formatted = pd.concat(frames, ignore_index=True)
    if drop_duplicates:
        formatted.drop_duplicates(inplace=True, keep="first")
    return formatted

`get_neighborhoods()`

Get all neighborhoods in DK

Source code in fuzzup/whitelists.py

def get_neighborhoods(timeout: float = 30.0):
    """Get all neighborhoods in DK

    Queries the dataforsyningen.dk "steder" endpoint for places of
    main type "Bebyggelse" and sub type "bydel".

    Args:
        timeout (float, optional): Seconds to wait for the API
            before giving up. Defaults to 30.0.

    Returns:
        dict: neighborhood names ("primærtnavn") mapped to
            {"eblocal_code": <id of the place>}.

    Raises:
        requests.RequestException: if the request times out, fails
            or returns an HTTP error status.
    """
    url = "https://api.dataforsyningen.dk/steder?hovedtype=Bebyggelse&undertype=bydel"
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of failing on the payload below.
    response.raise_for_status()
    hoods = response.json()
    return {hood["primærtnavn"]: {"eblocal_code": hood["id"]} for hood in hoods}

`get_politicians()`

copy pasta from https://github.com/cfblaeb/politik

Source code in fuzzup/whitelists.py

def get_politicians(timeout: float = 30.0):
    """Get names of politicians from the ft.dk open data API

    Pages through the "Aktør" table and keeps rows of type 5
    (politician in Folketinget) that have a start date, no end date
    and both a first and a last name.

    Adapted from https://github.com/cfblaeb/politik

    Args:
        timeout (float, optional): Seconds to wait for each API
            request before giving up. Defaults to 30.0.

    Returns:
        dict: cleaned "<first name> <last name>" strings, inserted in
            sorted order, each mapped to an empty dict (fuzzup
            whitelist format).

    Raises:
        requests.RequestException: if a request times out, fails
            or returns an HTTP error status.
    """

    # ft.dk only allows 100 rows per call
    page_size = 100
    table = "Aktør"
    url = f"http://oda.ft.dk/api/{table}"

    def fetch(params: dict) -> dict:
        """Request one page and return its decoded JSON payload."""
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()

    totalcount = int(fetch({"$inlinecount": "allpages"})["odata.count"])
    logger.info(f"# records: {totalcount}")

    required_fields = ("startdato", "fornavn", "efternavn")
    results = []
    ccount = 0
    while ccount < totalcount:
        for row in fetch({"$skip": ccount})["value"]:
            # Type_ID 5 = Politiker i folketinget.
            if row.get("typeid") != 5:
                continue
            # rows with an end date are skipped
            if row.get("slutdato") is not None:
                continue
            if all(row.get(field) is not None for field in required_fields):
                results.append(row)

        ccount += page_size
        if ccount % 1000 == 0:
            logger.info(f"# records processed: {ccount}/{totalcount}")

    logger.info(f"Number of politicians identified: {len(results)}")

    # extract names
    names = sorted(
        clean_string(f"{row['fornavn']} {row['efternavn']}").strip()
        for row in results
    )

    # convert to fuzzup dict format
    return {name: {} for name in names}

`match_whitelist(words, whitelist, score_cutoff=80, to_dataframe=False, aggregate_cluster=False, individual_wl_match=True, match_strategy=False, entity_group=None, **kwargs)`

Match entities with white list

Parameters:

Name	Type	Description	Default
`words`	`List[Dict]`	words/entities for matching.	required
`whitelist`	`List[str]`	white list with words/entities to match with. A dict whose keys are the words to match (and whose values hold their mappings) is also accepted.	required
`score_cutoff`	`float`	Cutoff threshold value for matching. Defaults to 80.	`80`
`to_dataframe`	`bool`	Return output as data frame. Defaults to False.	`False`
`aggregate_cluster`	`bool`	Aggregate matches to cluster level. Defaults to False.	`False`
`kwargs`		optional arguments for cdist.	`{}`
`entity_group`	`List[str]`	which entity groups to match.	`None`

Returns:

Type	Description
`List[Dict]`	words and their respective matches with the white list.

Source code in fuzzup/whitelists.py

def match_whitelist(
    words: List[Dict],
    whitelist: List[str],
    score_cutoff: float = 80,
    to_dataframe: bool = False,
    aggregate_cluster: bool = False,
    individual_wl_match: bool = True,
    match_strategy: bool = False,
    entity_group: List[str] = None,
    **kwargs,
) -> List[Dict]:
    """Match entities with white list

    Scores every word against every whitelist entry with cdist and
    keeps, per word, the whitelist entries with a non-zero score.
    Words without any match are dropped from the output.

    Args:
        words (List[Dict]): words/entities for matching. Either a
            list of dicts with a "word" key (NER format) or a list
            of plain strings.
        whitelist (List[str]): white list with words/entities
            to match with. A dict is accepted as well: its keys are
            matched against, its values are returned as "mappings",
            and the "version" entry of each value is collected into
            a "versions" column (NER format only).
        score_cutoff (float, optional): Cutoff threshold value for
            matching. Defaults to 80.
        to_dataframe (bool, optional): Return output as data frame.
            Defaults to False.
        aggregate_cluster (bool, optional): Aggregate matches to
            cluster level. Only applies to NER-format input.
            Defaults to False.
        individual_wl_match (bool, optional): not referenced in the
            function body. Defaults to True.
        match_strategy (bool, optional): if True and the input has a
            "prominence_rank" column, restrict the output by
            prominence rank (see the inline comments). Only applies
            to NER-format input. Defaults to False.
        entity_group (List[str], optional): which entity groups to
            match. Words from other groups are discarded. Defaults
            to None (no filtering).
        kwargs: optional arguments for cdist.

    Returns:
        List[Dict]: words and their respective matches with the
            white list. A pandas DataFrame is returned instead when
            'to_dataframe' is True.

    Raises:
        AssertionError: if 'words' is not a list or 'whitelist' is
            neither a list nor a dict.
    """
    assert isinstance(words, list), "'words' must be a list"
    assert isinstance(whitelist, (list, dict)), "'whitelist' must be a list or dit"

    def filter_rank_df(
        df_filter: pd.DataFrame,
        df: pd.DataFrame,
        rank_limit: int = 1,
        min_count: int = 2,
    ) -> pd.DataFrame:
        """Keep the rows of 'df' whose cluster is selected in 'df_filter'.

        A cluster is selected when it has prominence rank 1, or
        prominence rank 2 with a count of at least 2.

        NOTE(review): 'rank_limit' and 'min_count' are not used; the
        thresholds are hard-coded in the query string.
        """

        rank_list = df_filter.query(
            "count >= 2 and prominence_rank == 2 or prominence_rank==1"
        ).cluster_id.tolist()
        df = df.query("cluster_id in @rank_list")
        return df

    def count_word_prominence_freq(strings: List[str]) -> pd.Series:
        """
        Count how often each string occurs in 'strings'.

        NOTE(review): this helper is not called anywhere in this function.
        """
        return pd.value_counts(np.array(strings))

    def convert_version_list_to_set(row: pd.Series) -> pd.Series:
        """This method will convert any list of version numbers to a set of unique versions"""
        row["versions"] = set(row["versions"].tolist())
        return row

    # if the whitelist is a dictionary, then generate a list of keys for later use
    is_dict = False
    if isinstance(whitelist, dict):
        is_dict = True
        whitelist_dict = whitelist
        whitelist = list(whitelist.keys())
        # one entry per whitelist key; None when a value has no "version"
        whitelist_versions = [x.get("version") for x in whitelist_dict.values()]

    # handle trivial case (empty list)
    if not words or not whitelist:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []
    # if all words are dictionaries, it is assumed to be NER format
    if isinstance(words, list) and all([isinstance(x, dict) for x in words]):
        output_ner = True
        if entity_group is not None:
            words = [x for x in words if x.get("entity_group") in entity_group]
        strings = [x.get("word") for x in words]

    else:
        output_ner = False
        strings = words

    # nothing left to match (e.g. after filtering by entity group)
    if len(strings) == 0:
        if to_dataframe:
            return pd.DataFrame()
        else:
            return []

    # compute distances - length of the whitelist
    # Only takes strings ~ so no information about prominence here.
    # Rows correspond to whitelist entries, columns to the input strings.
    dists = cdist(whitelist, strings, score_cutoff=score_cutoff, **kwargs)

    # All matches on the whitelist ~ pass all words here to start with and then handle matches afterwards
    # For each input string (one column of dists), pick the whitelist
    # entries with a non-zero score.
    # NOTE(review): presumably cdist zeroes scores below score_cutoff - confirm.
    matches = [np.array(whitelist)[np.where(col)] for col in dists.T]
    if is_dict:
        versions = [np.array(whitelist_versions)[np.where(col)] for col in dists.T]
    if not output_ner:
        df = pd.DataFrame.from_dict({"word": strings, "matches": matches})

    if output_ner:
        df = pd.DataFrame.from_records(words)
        df["matches"] = matches
        if is_dict:
            df["versions"] = versions
            df = df.apply(convert_version_list_to_set, axis=1)

        # MATCH STRATEGY
        if "prominence_rank" in df and match_strategy is True:
            # number of words per (cluster, prominence rank) combination
            df_filter = (
                df.groupby(["cluster_id", "prominence_rank"])["word"]
                .count()
                .reset_index()
                .rename({"word": "count"}, axis=1)
            )

            # If rank 2 has an occurrence of at least 2, regardless of prominence_score
            if (df_filter[df_filter["prominence_rank"] == 2]["count"] >= 2).any():
                df = filter_rank_df(
                    df_filter=df_filter, df=df, rank_limit=2, min_count=2
                )

            # Regress to match-strategy of returning rank 1 only
            else:
                df = df[df["prominence_rank"] == 1]  # return first rank only

        if aggregate_cluster:
            # replace the per-word matches with one aggregated value per
            # cluster, merged back onto every word of that cluster
            matches = pd.DataFrame(
                df.groupby(by=["cluster_id"]).apply(aggregate_to_cluster),
                columns=["matches"],
                index=None,
            )
            matches = matches.reset_index()
            df.drop("matches", axis=1, inplace=True)
            df = pd.merge(df, matches, how="left")

    # convert the match arrays to plain lists
    df["matches"] = [x.tolist() for x in df["matches"]]

    if is_dict:
        # look up the whitelist value for every matched key
        mappings = []
        for match in df.matches.tolist():
            out = [whitelist_dict.get(x) for x in match]
            mappings.append(out)
        df["mappings"] = mappings

    # subset matches only
    df = df[df["matches"].astype(str) != "[]"]

    if not to_dataframe:
        df = df.to_dict(orient="records")

    return df