Code Reference
compute_fuzzy_matrix(strings, **kwargs)
Compute Fuzzy Matrix
Computes a matrix with pairwise fuzzy ratios (= edit distances) between all strings.
The result can be thought of as a correlation matrix with all diagonal elements equal to 100.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
strings |
List[str] |
strings for clustering. |
required |
kwargs |
all optional arguments for rapidfuzz.process.cdist. |
{} |
Returns:
Type | Description |
---|---|
pd.DataFrame |
pairwise fuzzy ratios between strings. |
Examples:
>>> person_names = ['Donald Trump', 'Donald Trump',
'J. biden', 'joe biden', 'Biden',
'Bide', 'mark esper',
'Christopher c . miller',
'jim mattis', 'Nancy Pelosi',
'trumps', 'Trump', 'Donald',
'miller']
....
Source code in fuzzup/fuzz.py
def compute_fuzzy_matrix(strings: List[str], **kwargs) -> pd.DataFrame:
    """Compute Fuzzy Matrix

    Computes a matrix with pairwise fuzzy ratios (= edit distances)
    between all unique strings.

    The result can be thought of as a correlation
    matrix with all diagonal elements equal to 100.

    Duplicates are removed before scoring. The first occurrence of a
    string decides its position, so the row/column order is the same
    on every run.

    Args:
        strings (List[str]): strings for clustering.
        kwargs: all optional arguments for
            rapidfuzz.process.cdist.

    Returns:
        pd.DataFrame: pairwise fuzzy ratios between
            strings, indexed and labelled by the unique strings.

    Examples:
        >>> person_names = ['Donald Trump', 'Donald Trump',
        ...                 'J. biden', 'joe biden', 'Biden',
        ...                 'Bide', 'mark esper',
        ...                 'Christopher c . miller',
        ...                 'jim mattis', 'Nancy Pelosi',
        ...                 'trumps', 'Trump', 'Donald',
        ...                 'miller']
        >>> compute_fuzzy_matrix(person_names)  # doctest: +SKIP
    """
    # Subset unique strings. dict.fromkeys keeps first-seen order, whereas
    # set() ordering depends on string hashing and can change between runs.
    strings = list(dict.fromkeys(strings))

    # compute edit distances
    dists = cdist(strings, strings, **kwargs)

    return pd.DataFrame(dists, index=strings, columns=strings)
compute_prominence(clusters, to_dataframe=False, merge_output=True, weight_position=None, weight_multipliers=None)
Compute Prominence
Computes prominence of entity clusters.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
clusters |
List[Dict] |
Entity clusters. |
required |
to_dataframe |
bool |
Export output as pandas dataframe? Defaults to False. |
False |
merge_output |
bool |
Merge resulting cluster meta data with input data. Defaults to True. |
True |
weight_position |
float |
threshold for position-adjusted weight interpolation. Defaults to None implying no adjustment for positions in text. |
None |
weight_multipliers |
ndarray |
weight multipliers. |
None |
Returns:
Type | Description |
---|---|
List[Dict] |
clusters and their prominence. |
Examples:
...
Source code in fuzzup/fuzz.py
def compute_prominence(
    clusters: List[Dict],
    to_dataframe: bool = False,
    merge_output: bool = True,
    weight_position: float = None,
    weight_multipliers: np.ndarray = None,
) -> List[Dict]:
    """Compute Prominence

    Computes prominence of entity clusters.

    Every entity contributes a score of 1, optionally scaled by a
    position weight and by a per-entity multiplier. Scores are summed
    per cluster and the clusters are ranked, rank 1 being the most
    prominent. Tied clusters share the best rank of the tie.

    Args:
        clusters (List[Dict]): Entity clusters.
        to_dataframe (bool, optional): Export output
            as pandas dataframe? Defaults to False.
        merge_output (bool, optional): Merge resulting
            cluster meta data with input data. Defaults to True.
        weight_position: threshold for position-adjusted
            weight interpolation, between 0 and 1. The entity with
            the smallest 'start' gets weight 1, the one with the
            largest 'start' gets this value, others are linearly
            interpolated. Defaults to None implying
            no adjustment for positions in text.
        weight_multipliers: weight multipliers, one per entity
            (any sequence of numbers). Defaults to None (no scaling).

    Returns:
        List[Dict]: clusters and their prominence. A pandas
            DataFrame is returned instead when `to_dataframe` is True.

    Raises:
        AssertionError: if `weight_position` is outside [0, 1] or
            `weight_multipliers` does not have one entry per entity.

    Examples:
        ...
    """
    # handle trivial case (empty list)
    if len(clusters) == 0:
        return pd.DataFrame() if to_dataframe else []

    # validate inputs
    if weight_position is not None:
        assert 0 <= weight_position <= 1, "choose 'weight_position' between 0 and 1"

    if weight_multipliers is None:
        weight_multipliers = float(1)
    else:
        assert len(weight_multipliers) == len(
            clusters
        ), "Multipliers must have same length as number of entities"
        # accept plain sequences too; a Python list cannot be multiplied by a float
        weight_multipliers = np.asarray(weight_multipliers)

    clusters = pd.DataFrame.from_dict(clusters)

    # Results of an earlier run would otherwise act as extra merge keys and
    # the (stale) input values would be returned unchanged.
    stale = [
        col for col in ("prominence_score", "prominence_rank") if col in clusters.columns
    ]
    if stale:
        clusters = clusters.drop(columns=stale)

    prominence = clusters.copy()

    # adjust prominence score for word positions (=offsets)
    if weight_position is not None and len(clusters.start) > 1:
        # linear interpolation between the first and the last offset
        xp = [min(clusters.start), max(clusters.start)]
        yp = [1, weight_position]
        prominence_position = np.interp(clusters.start, xp, yp)
    else:
        prominence_position = float(1)

    prominence["prominence_score"] = (
        float(1) * prominence_position * weight_multipliers
    )

    # aggregate prominence to group level
    prominence = prominence.groupby(CLUSTER_ID)["prominence_score"].sum()

    # rank clusters by prominence (invert so the highest score gets rank 1)
    ranks = rankdata(prominence, method="max")
    ranks = max(ranks) - ranks + 1

    # organize output as data frame
    prominence = pd.DataFrame(prominence)
    prominence["prominence_rank"] = ranks
    prominence = prominence.reset_index(level=0)

    if merge_output:
        prominence = pd.merge(clusters, prominence, how="left", on=CLUSTER_ID)

    if not to_dataframe:
        prominence = prominence.to_dict(orient="records")

    return prominence
compute_prominence_bygroup(clusters, return_first_rank=False, **kwargs)
Compute Prominence by Group
Computes prominence by entity group. Simply calls compute_prominence() groupwise.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
clusters |
List[Dict] |
Entity clusters. |
required |
kwargs |
all optional arguments for compute_prominence(). |
{} |
Returns:
Type | Description |
---|---|
List[Dict] |
entity clusters with prominence scores. |
Source code in fuzzup/fuzz.py
def compute_prominence_bygroup(
    clusters: List[Dict], return_first_rank: bool = False, **kwargs
) -> List[Dict]:
    """Compute Prominence by Group

    Computes prominence by entity group. Simply
    calls compute_prominence() groupwise.

    Args:
        clusters (List[Dict]): Entity clusters. Every entry needs an
            'entity_group' key.
        return_first_rank (bool, optional): Only return entities whose
            'prominence_rank' is 1 within their group. Defaults to False.
        kwargs: all optional arguments for
            compute_prominence(). A 'weight_multipliers' argument must
            hold one value per entry of `clusters`; it is split so each
            group receives only its own multipliers.

    Returns:
        List[Dict]: entity clusters with
            prominence scores.

    Raises:
        AssertionError: if 'weight_multipliers' is given and its length
            differs from the number of entities.
    """
    # handle trivial case.
    if len(clusters) == 0:
        return []

    frame = pd.DataFrame.from_dict(clusters)

    # Multipliers are per entity. Forwarding the full array to every group
    # would fail the length check in compute_prominence() as soon as there
    # is more than one group, so they are sliced per group below.
    weight_multipliers = kwargs.pop("weight_multipliers", None)
    if weight_multipliers is not None:
        weight_multipliers = np.asarray(weight_multipliers)
        assert len(weight_multipliers) == len(
            frame
        ), "Multipliers must have same length as number of entities"

    out = []
    for _, group in frame.groupby("entity_group"):
        group_kwargs = dict(kwargs)
        if weight_multipliers is not None:
            # frame has a default 0..n-1 index, so labels equal positions
            group_kwargs["weight_multipliers"] = weight_multipliers[
                group.index.to_numpy()
            ]
        out.append(
            compute_prominence(
                clusters=group.to_dict(orient="records"), **group_kwargs
            )
        )
    out = flatten(out)

    # If you only want the most prominent entities returned,
    # drop all entities that are not the most prominent
    if return_first_rank:
        out = [i for i in out if i["prominence_rank"] == 1]

    return out
compute_prominence_placement(clusters, placement_col='placement', wgt_body=1.0, wgt_lead=2.0, wgt_title=3.0, bygroup=False, **kwargs)
Compute Prominence from Article Placement
Parameters:
Name | Type | Description | Default |
---|---|---|---|
clusters |
list |
NER predictions. |
required |
placement_col |
str |
Name of column containing article placement of entities. Defaults to "placement". |
'placement' |
wgt_body |
float |
Weight of entities in body text. Defaults to 1.0. |
1.0 |
wgt_lead |
float |
Weight of entities in lead text. Defaults to 2.0. |
2.0 |
wgt_title |
float |
Weight of entities in title. Defaults to 3.0. |
3.0 |
bygroup |
bool |
use compute_prominence_bygroup() instead of compute_prominence()? Defaults to False. |
False |
kwargs |
all optional arguments for compute_prominence(bygroup). |
{} |
Returns:
Type | Description |
---|---|
list |
predictions with prominence scores. |
Source code in fuzzup/fuzz.py
def compute_prominence_placement(
    clusters: list,
    placement_col: str = "placement",
    wgt_body: float = 1.0,
    wgt_lead: float = 2.0,
    wgt_title: float = 3.0,
    bygroup: bool = False,
    **kwargs,
) -> list:
    """Compute Prominence from Article Placement

    Translates the placement of every entity ("body", "lead" or
    "title") into a weight multiplier and passes the multipliers on
    to compute_prominence() or compute_prominence_bygroup().

    Args:
        clusters (list): NER predictions.
        placement_col (str, optional): Name of column containing article
            placement of entities. Defaults to "placement".
        wgt_body (float, optional): Weight of entities in body
            text. Defaults to 1.0.
        wgt_lead (float, optional): Weight of entities in lead
            text. Defaults to 2.0.
        wgt_title (float, optional): Weight of entities in title.
            Defaults to 3.0.
        bygroup (bool, optional): use compute_prominence_bygroup()
            instead of compute_prominence()? Defaults to False.
        kwargs: all optional arguments for compute_prominence(bygroup).

    Returns:
        list: predictions with prominence scores.

    Raises:
        AssertionError: if an entry lacks `placement_col` or holds a
            placement other than "body", "lead" or "title".
    """
    if len(clusters) == 0:
        return []

    assert all(
        placement_col in x for x in clusters
    ), f"key {placement_col} must be present in all dicts"

    weights = {"body": wgt_body, "lead": wgt_lead, "title": wgt_title}

    # An unknown placement would otherwise turn into a None multiplier and
    # only fail later with an unrelated-looking TypeError.
    unknown = sorted(
        {str(x[placement_col]) for x in clusters if x[placement_col] not in weights}
    )
    assert (
        not unknown
    ), f"unknown value(s) {unknown} for key {placement_col}, expected one of {list(weights)}"

    multipliers = np.array([weights[x[placement_col]] for x in clusters])

    if bygroup:
        prominence_function = compute_prominence_bygroup
    else:
        prominence_function = compute_prominence

    return prominence_function(clusters, weight_multipliers=multipliers, **kwargs)
fuzzy_cluster(words, cutoff=70, to_dataframe=False, merge_output=True, **kwargs)
Fuzzy Cluster: clusters words/entities by their pairwise fuzzy ratios.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
words |
List[Dict] |
Words/entities for clustering. |
required |
cutoff |
int |
Cutoff threshold value for fuzzy ratios when forming clusters. Defaults to 70. |
70 |
to_dataframe |
bool |
Output as dataframe? Defaults to False. |
False |
merge_output |
bool |
Merge output with original input? Defaults to True. |
True |
Returns:
Type | Description |
---|---|
List[Dict] |
Clusters of entities. |
Source code in fuzzup/fuzz.py
def fuzzy_cluster(
    words: List[Dict],
    cutoff: int = 70,
    to_dataframe: bool = False,
    merge_output: bool = True,
    **kwargs,
) -> List[Dict]:
    """Fuzzy Cluster

    Clusters words/entities by their pairwise fuzzy ratios. Every
    cluster is identified by its longest member.

    The input is not modified: cluster ids left over from an earlier
    run are ignored on a copy of the input dicts.

    Args:
        words (List[Dict]): Words/entities for clustering. Either
            dicts with a 'word' key (NER format) or plain strings.
        cutoff (int, optional): Cutoff threshold value for fuzzy
            ratios when forming clusters. Defaults to 70.
        to_dataframe (bool, optional): Output as dataframe?
            Defaults to False.
        merge_output (bool, optional): Merge output with
            original input? Only applies to input in NER format.
            Defaults to True.
        kwargs: optional arguments passed on to
            compute_fuzzy_matrix().

    Returns:
        List[Dict]: Clusters of entities. A pandas DataFrame is
            returned instead when `to_dataframe` is True.

    Raises:
        AssertionError: if `words` is not a list.
    """
    assert isinstance(words, list), "'words' must be a list"

    # handle trivial case (empty list)
    if not words:
        return pd.DataFrame() if to_dataframe else []

    # if all words are dictionaries, the input is treated as NER format
    output_ner = all(isinstance(x, dict) for x in words)
    if output_ner:
        # Drop existing cluster ids on copies (not on the caller's dicts);
        # otherwise they would act as an extra key in the merge below.
        stale_keys = {"cluster_id", CLUSTER_ID}
        words = [
            {key: value for key, value in x.items() if key not in stale_keys}
            for x in words
        ]
        strings = [x.get("word") for x in words]
    else:
        strings = words

    # compute fuzzy ratios
    fuzzy_matrix = compute_fuzzy_matrix(strings, **kwargs)

    clusters = naive_cluster(fuzzy_matrix, cutoff=cutoff)

    # organize output properly (for compatibility with transformers NER
    # pipeline); the cluster id is the longest entity variation
    frames = [
        pd.DataFrame.from_dict({"word": cluster, CLUSTER_ID: max(cluster, key=len)})
        for cluster in clusters
    ]
    output = pd.concat(frames, ignore_index=True)

    # merge output with original input
    if output_ner and merge_output:
        output = pd.merge(pd.DataFrame.from_dict(words), output, how="left", on="word")

    if not to_dataframe:
        output = output.to_dict(orient="records")

    return output
fuzzy_cluster_bygroup(words, **kwargs)
Fuzzy Cluster By Group
Fuzzy clustering by entity group. Simply calls fuzzy_cluster() groupwise.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
words |
List[Dict] |
Words/entities. |
required |
kwargs |
all optional arguments for fuzzy_cluster(). |
{} |
Returns:
Type | Description |
---|---|
List[Dict] |
entity clusters. |
Source code in fuzzup/fuzz.py
def fuzzy_cluster_bygroup(words: List[Dict], **kwargs) -> List[Dict]:
    """Fuzzy Cluster By Group

    Runs fuzzy_cluster() separately on the entities of each
    'entity_group' and concatenates the per-group results.

    Args:
        words (List[Dict]): Words/entities. Every entry needs an
            'entity_group' key.
        kwargs: all optional arguments for
            fuzzy_cluster().

    Returns:
        List[Dict]: entity clusters.
    """
    # nothing to cluster
    if len(words) == 0:
        return []

    frame = pd.DataFrame.from_dict(words)

    per_group = [
        fuzzy_cluster(words=members.to_dict(orient="records"), **kwargs)
        for _, members in frame.groupby("entity_group")
    ]

    return flatten(per_group)
naive_cluster(fuzzy_matrix, cutoff=70, **kwargs)
Naive Clustering
Conducts naive clustering based on matrix with pairwise correlations, fuzzy ratios etc.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fuzzy_matrix |
pd.DataFrame |
Matrix with pairwise fuzzy ratios between words. |
required |
cutoff |
float |
Threshold for naive clustering algorithm with respect to pairwise fuzzy ratios. Defaults to 70. |
70 |
Returns:
Type | Description |
---|---|
list |
resulting clusters. |
Source code in fuzzup/fuzz.py
def naive_cluster(fuzzy_matrix: pd.DataFrame, cutoff: float = 70, **kwargs) -> list:
    """Naive Clustering

    Builds clusters one at a time from a matrix with pairwise
    correlations, fuzzy ratios etc. The first remaining row label seeds
    the next cluster until no rows are left.

    Args:
        fuzzy_matrix (pd.DataFrame): Matrix with
            pairwise fuzzy ratios between words.
        cutoff (float, optional): Threshold for naive
            clustering algorithm with respect to
            pairwise fuzzy ratios. Defaults to 70.
        kwargs: accepted but not used by this function.

    Returns:
        list: resulting clusters.
    """
    remaining = fuzzy_matrix
    found = []

    while len(remaining) > 0:
        seed = [remaining.index.tolist()[0]]
        # helper_clustering returns the cluster for the seed and the matrix to
        # continue with (presumably without the clustered entries - verify in
        # the helper).
        members, remaining = helper_clustering(remaining, seed, cutoff=cutoff)
        found.append(members)

    return found
simulate_ner_data()
Simulate NER data
Returns:
Type | Description |
---|---|
List[Dict] |
Simulated NER data. |
Source code in fuzzup/datasets.py
def simulate_ner_data(seed: int = None) -> List[Dict]:
    """Simulate NER data

    Builds a fixed list of person names and decorates each with random
    meta data in the format of the Hugging Face `transformers` NER
    pipeline, plus a random article placement.

    Args:
        seed (int, optional): Seed for reproducible output. Defaults to
            None, in which case the global `numpy.random` and `random`
            state is used.

    Returns:
        List[Dict]: Simulated NER data with keys 'word',
            'entity_group', 'score', 'start', 'end' and 'placement'.
    """
    persons = [
        "Donald Trump",
        "Donald Trump",
        "J. biden",
        "joe biden",
        "Biden",
        "Bide",
        "mark esper",
        "Christopher c . miller",
        "jim mattis",
        "Nancy Pelosi",
        "trumps",
        "Trump",
        "Donald",
        "miller",
    ]
    n = len(persons)

    # Dedicated generators keep a seeded call from touching global RNG state.
    if seed is None:
        np_rng, py_rng = np.random, random
    else:
        np_rng, py_rng = np.random.RandomState(seed), random.Random(seed)

    # align format with output from Hugging Face `transformers` pipeline
    persons_ner = pd.DataFrame(data=persons, columns=["word"])
    persons_ner["entity_group"] = "PER"
    persons_ner["score"] = np_rng.random_sample(n)
    # NOTE: 'start' and 'end' are drawn independently, so 'end' may be
    # smaller than 'start'.
    persons_ner["start"] = np_rng.randint(100, size=n)
    persons_ner["end"] = np_rng.randint(100, size=n)
    placements = ["title", "lead", "body"]
    persons_ner["placement"] = py_rng.choices(placements, k=n)

    return persons_ner.to_dict(orient="records")
Cities (Whitelist)
Danish Cities
Whitelist of names of Danish cities initialized from the DAWA API.
Source code in fuzzup/whitelists.py
class Cities(Whitelist):
    """Danish Cities

    Whitelist of Danish city names. The names are loaded with
    get_eblocal_byer() and matched against entities in the "LOC"
    entity group; results are titled "city".
    """

    def __init__(self, **kwargs):
        # kwargs are passed on to the loader by Whitelist.__init__
        super().__init__(get_eblocal_byer, "city", ["LOC"], **kwargs)
EBLocalNames (Whitelist)
EB Local Names
Whitelist with Ekstra Bladet Local Names.
Source code in fuzzup/whitelists.py
class EBLocalNames(Whitelist):
    """EB Local Names

    Whitelist with Ekstra Bladet Local Names. The names are loaded with
    get_eblocal_names() and matched against entities in the "LOC"
    entity group; results are titled "eblocal_name".
    """

    def __init__(self, **kwargs):
        # kwargs are passed on to the loader by Whitelist.__init__
        super().__init__(get_eblocal_names, "eblocal_name", ["LOC"], **kwargs)
Municipalities (Whitelist)
Danish Municipalities
Whitelist of names of Danish Municipalities initialized from the DAWA API.
Source code in fuzzup/whitelists.py
class Municipalities(Whitelist):
    """Danish Municipalities

    Whitelist of names of Danish municipalities. The names are loaded
    with get_municipalities() and matched against entities in the "LOC"
    entity group; results are titled "municipality".
    """

    def __init__(self, **kwargs):
        # kwargs are passed on to the loader by Whitelist.__init__
        super().__init__(get_municipalities, "municipality", ["LOC"], **kwargs)
Neighborhoods (Whitelist)
Danish Neighborhoods
Whitelist of names of Danish Neighborhoods initialized from the DAWA API.
Source code in fuzzup/whitelists.py
class Neighborhoods(Whitelist):
    """Danish Neighborhoods

    Whitelist of names of Danish neighborhoods. The names are loaded
    with get_eblocal_neighborhoods() and matched against entities in
    the "LOC" entity group; results are titled "neighborhood".
    """

    def __init__(self, **kwargs):
        # kwargs are passed on to the loader by Whitelist.__init__
        super().__init__(
            get_eblocal_neighborhoods, "neighborhood", ["LOC"], **kwargs
        )
Whitelist
Whitelist
Whitelist objects containing whitelists and relevant meta data regarding how to apply it.
Attributes:
Name | Type | Description |
---|---|---|
entity_group |
List[str] |
the entity group of interest for the given whitelist. |
title |
str |
title of the type of entity the whitelist relates to. |
whitelist |
dict |
whitelist with keys to match with. The values contain mappings for the given key. |
Source code in fuzzup/whitelists.py
class Whitelist:
    """Whitelist

    Bundles a whitelist with the meta data needed to apply it. The
    whitelist is loaded once on construction; calling the instance
    matches words/entities against it via match_whitelist().

    Attributes:
        entity_group (List[str]): the entity group(s) of interest
            for the given whitelist.
        title (str): title of the type of entity the
            whitelist relates to.
        whitelist (dict): whitelist with keys to
            match with. The values contain mappings
            for the given key.
    """

    def __init__(self, function_load, title, entity_group, **kwargs) -> None:
        self.title = title
        self.entity_group = entity_group

        # kwargs go to the loader untouched
        logger.info(f"Loading whitelist: {title}")
        self.whitelist = function_load(**kwargs)
        logger.info("Done loading.")

    def __call__(self, words: List[Dict], **kwargs) -> List[Dict]:
        # kwargs are passed on to match_whitelist()
        return match_whitelist(
            words=words,
            whitelist=self.whitelist,
            entity_group=self.entity_group,
            **kwargs,
        )
apply_whitelists(whitelists, clusters, **kwargs)
Apply Multiple Whitelists
Parameters:
Name | Type | Description | Default |
---|---|---|---|
whitelists |
List[Whitelist] |
Whitelists. |
required |
clusters |
List[Dict] |
Results from fuzzy clustering etc. |
required |
kwargs |
all optional arguments for whitelist matching. |
{} |
Returns:
Type | Description |
---|---|
Dict |
output from whitelist applications. |
Source code in fuzzup/whitelists.py
def apply_whitelists(
    whitelists: List["Whitelist"],
    clusters: List[Dict],
    **kwargs,
) -> Dict:
    """Apply Multiple Whitelists

    Calls every whitelist on the same clusters and collects the results
    under the title of the respective whitelist.

    Args:
        whitelists (List[Whitelist]): Whitelists.
        clusters (List[Dict]): Results from fuzzy clustering etc.
        kwargs: all optional arguments for whitelist matching.

    Returns:
        Dict: output from whitelist applications, keyed by whitelist
            title. If two whitelists share a title, the later one
            overwrites the earlier one.
    """
    return {wl.title: wl(clusters, **kwargs) for wl in whitelists}
format_output(results, columns=['neighborhood_code', 'city_code', 'municipality_code'], drop_duplicates=True)
Format Output
Formats output from whitelist format by extracting only specific columns and converting them to a pandas DataFrame.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
results |
List[Dict] |
Results from Fuzzy Clustering. |
required |
columns |
List[str] |
Desired columns to extract. Defaults to ['neighborhood_code', 'city_code', 'municipality_code']. |
['neighborhood_code', 'city_code', 'municipality_code'] |
drop_duplicates |
bool |
Drop duplicate matches? Defaults to True. |
True |
Returns:
Type | Description |
---|---|
pd.DataFrame |
Output in desired format. |
Source code in fuzzup/whitelists.py
def format_output(
    results: Dict,
    columns: List[str] = None,
    drop_duplicates: bool = True,
) -> pd.DataFrame:
    """Format Output

    Formats output from whitelist format by extracting
    only specific columns and converting them to
    a pandas DataFrame.

    Args:
        results (Dict): Output from whitelist applications, one entry
            per whitelist (as returned by apply_whitelists()).
        columns (List[str], optional): Desired columns
            to extract. Defaults to
            ['neighborhood_code', 'city_code',
            'municipality_code'].
        drop_duplicates (bool, optional): Drop duplicate
            matches? Defaults to True.

    Returns:
        pd.DataFrame: Output in desired format. An empty DataFrame is
            returned when `results` has no entries.
    """
    # a fresh list per call; a list default would be shared between calls
    if columns is None:
        columns = ["neighborhood_code", "city_code", "municipality_code"]

    frames = [format_helper(x=matches, columns=columns) for matches in results.values()]

    # pd.concat raises on an empty list
    if not frames:
        return pd.DataFrame()

    formatted = pd.concat(frames, ignore_index=True)

    if drop_duplicates:
        formatted = formatted.drop_duplicates(keep="first")

    return formatted
get_neighborhoods()
Get all neighborhoods in DK
Source code in fuzzup/whitelists.py
def get_neighborhoods(timeout: float = 30.0):
    """Get all neighborhoods in DK

    Requests all places of main type "Bebyggelse" and sub type "bydel"
    from api.dataforsyningen.dk.

    Args:
        timeout (float, optional): Seconds to wait for the server to
            connect/respond before giving up. Defaults to 30.0.

    Returns:
        dict: maps the primary name ("primærtnavn") of each place to
            {"eblocal_code": <id of the place>}.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not respond in time.
    """
    url = "https://api.dataforsyningen.dk/steder?hovedtype=Bebyggelse&undertype=bydel"

    # without a timeout requests can wait forever on a stalled connection
    response = requests.get(url, timeout=timeout)
    # fail on HTTP errors rather than on parsing an error body
    response.raise_for_status()
    hoods = response.json()

    return {hood["primærtnavn"]: {"eblocal_code": hood["id"]} for hood in hoods}
get_politicians()
copy pasta from https://github.com/cfblaeb/politik
Source code in fuzzup/whitelists.py
def get_politicians(timeout: float = 30.0):
    """Get names of politicians from the oda.ft.dk API

    Pages through the "Aktør" table and keeps rows of type 5 that have
    a start date, no end date and both a first and a last name.

    copy pasta from https://github.com/cfblaeb/politik

    Args:
        timeout (float, optional): Seconds to wait for the server to
            connect/respond on each request. Defaults to 30.0.

    Returns:
        dict: cleaned "<first name> <last name>" strings in sorted
            order, each mapped to an empty dict (fuzzup whitelist
            format).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not respond in time.
    """

    def _is_current_politician(row) -> bool:
        """Check type, dates and names of a single API row."""
        return (
            row.get("typeid") == 5  # Type_ID 5 = politician in the Folketing
            and row.get("slutdato") is None
            and row.get("startdato") is not None
            and row.get("fornavn") is not None
            and row.get("efternavn") is not None
        )

    # ft.dk only allows 100 rows per call
    table = "Aktør"
    url = f"http://oda.ft.dk/api/{table}"

    results = []
    # one session reuses the connection across the paginated calls
    with requests.Session() as session:
        response = session.get(
            url, params={"$inlinecount": "allpages"}, timeout=timeout
        )
        response.raise_for_status()
        totalcount = int(response.json()["odata.count"])

        ccount = 0
        print(f"# records: {totalcount}")

        while ccount < totalcount:
            page = session.get(url, params={"$skip": ccount}, timeout=timeout)
            page.raise_for_status()
            results.extend(
                row for row in page.json()["value"] if _is_current_politician(row)
            )
            ccount += 100
            if ccount % 1000 == 0:
                print(f"# records processed: {ccount}/{totalcount}")

    print(f"Number of politicians identified: {len(results)}")

    # extract names
    names = [x.get("fornavn") + " " + x.get("efternavn") for x in results]
    names = [clean_string(x).strip() for x in names]
    names.sort()

    # convert to fuzzup dict format
    return {x: {} for x in names}
match_whitelist(words, whitelist, score_cutoff=80, to_dataframe=False, aggregate_cluster=False, individual_wl_match=True, match_strategy=False, entity_group=None, **kwargs)
Match entities with white list
Parameters:
Name | Type | Description | Default |
---|---|---|---|
words |
List[Dict] |
words/entities for matching. |
required |
whitelist |
List[str] |
white list with words/entities to match with. |
required |
score_cutoff |
float |
Cutoff threshold value for matching. Defaults to 80. |
80 |
to_dataframe |
bool |
Return output as data frame. Defaults to False. |
False |
aggregate_cluster |
bool |
Aggregate matches to cluster level. Defaults to False. |
False |
kwargs |
optional arguments for cdist. |
{} |
|
entity_group |
List[str] |
which entity groups to match. |
None |
Returns:
Type | Description |
---|---|
List[Dict] |
words and their respective matches with the white list. |
Source code in fuzzup/whitelists.py
def match_whitelist(
    words: List[Dict],
    whitelist: List[str],
    score_cutoff: float = 80,
    to_dataframe: bool = False,
    aggregate_cluster: bool = False,
    individual_wl_match: bool = True,
    match_strategy: bool = False,
    entity_group: List[str] = None,
    **kwargs,
) -> List[Dict]:
    """Match entities with white list

    Args:
        words (List[Dict]): words/entities for matching. Either
            dicts with a 'word' key (NER format) or plain strings.
        whitelist (List[str]): white list with words/entities
            to match with. A dict is accepted too: its keys are
            matched and its values are returned as 'mappings'
            (their 'version' entries as 'versions').
        score_cutoff (float, optional): Cutoff threshold value for
            matching. Defaults to 80.
        to_dataframe (bool, optional): Return output as data frame.
            Defaults to False.
        aggregate_cluster (bool, optional): Aggregate matches to
            cluster level. Defaults to False.
        individual_wl_match (bool, optional): Not used by this
            function. Defaults to True.
        match_strategy (bool, optional): If True and the input has a
            'prominence_rank' column, only keep clusters of rank 1,
            plus clusters of rank 2 that hold at least two words.
            Defaults to False.
        entity_group: which entity groups to match.
        kwargs: optional arguments for cdist.

    Returns:
        List[Dict]: words and their respective matches with the
            white list. Words without any match are left out. A
            pandas DataFrame is returned instead when `to_dataframe`
            is True.

    Raises:
        AssertionError: if `words` is not a list or `whitelist` is
            neither a list nor a dict.
    """
    assert isinstance(words, list), "'words' must be a list"
    assert isinstance(whitelist, (list, dict)), "'whitelist' must be a list or dict"

    def filter_rank_df(
        df_filter: pd.DataFrame,
        df: pd.DataFrame,
        rank_limit: int = 1,
        min_count: int = 2,
    ) -> pd.DataFrame:
        """Keep rows of rank-1 clusters and of `rank_limit` clusters
        holding at least `min_count` words."""
        keep = (df_filter["prominence_rank"] == 1) | (
            (df_filter["count"] >= min_count)
            & (df_filter["prominence_rank"] == rank_limit)
        )
        rank_list = df_filter.loc[keep, "cluster_id"].tolist()
        return df[df["cluster_id"].isin(rank_list)]

    def convert_version_list_to_set(row: pd.Series) -> pd.Series:
        """This method will convert any list of version numbers to a set of unique versions"""
        row["versions"] = set(row["versions"].tolist())
        return row

    # if the whitelist is a dictionary, then generate a list of keys for later use
    is_dict = isinstance(whitelist, dict)
    if is_dict:
        whitelist_dict = whitelist
        whitelist = list(whitelist_dict.keys())
        whitelist_versions = [x.get("version") for x in whitelist_dict.values()]

    # handle trivial case (empty list)
    if not words or not whitelist:
        return pd.DataFrame() if to_dataframe else []

    # if all words are dictionaries, it is assumed to be NER format
    output_ner = all(isinstance(x, dict) for x in words)
    if output_ner:
        if entity_group is not None:
            words = [x for x in words if x.get("entity_group") in entity_group]
        strings = [x.get("word") for x in words]
    else:
        strings = words

    if len(strings) == 0:
        return pd.DataFrame() if to_dataframe else []

    # compute distances - one row per whitelist entry, one column per string.
    # Only takes strings ~ so no information about prominence here.
    dists = cdist(whitelist, strings, score_cutoff=score_cutoff, **kwargs)

    # All matches on the whitelist ~ pass all words here to start with and
    # then handle matches afterwards. Non-zero scores mark a match.
    whitelist_arr = np.array(whitelist)
    matches = [whitelist_arr[np.where(col)] for col in dists.T]

    if output_ner:
        df = pd.DataFrame.from_records(words)
        df["matches"] = matches
    else:
        df = pd.DataFrame.from_dict({"word": strings, "matches": matches})

    if is_dict:
        versions_arr = np.array(whitelist_versions)
        df["versions"] = [versions_arr[np.where(col)] for col in dists.T]
        df = df.apply(convert_version_list_to_set, axis=1)

    # MATCH STRATEGY
    if "prominence_rank" in df and match_strategy is True:
        df_filter = (
            df.groupby(["cluster_id", "prominence_rank"])["word"]
            .count()
            .reset_index()
            .rename({"word": "count"}, axis=1)
        )

        # If rank 2 has an occurrence of at least 2, regardless of prominence_score
        if (df_filter[df_filter["prominence_rank"] == 2]["count"] >= 2).any():
            df = filter_rank_df(
                df_filter=df_filter, df=df, rank_limit=2, min_count=2
            )
        # Regress to match-strategy of returning rank 1 only
        else:
            df = df[df["prominence_rank"] == 1]  # return first rank only

    if aggregate_cluster:
        matches = pd.DataFrame(
            df.groupby(by=["cluster_id"]).apply(aggregate_to_cluster),
            columns=["matches"],
            index=None,
        )
        matches = matches.reset_index()
        # df may be a filtered selection at this point, so do not drop in place
        df = df.drop("matches", axis=1)
        df = pd.merge(df, matches, how="left")

    df["matches"] = [x.tolist() for x in df["matches"]]

    if is_dict:
        df["mappings"] = [
            [whitelist_dict.get(x) for x in match] for match in df.matches.tolist()
        ]

    # subset matches only
    df = df[df["matches"].astype(str) != "[]"]

    if not to_dataframe:
        df = df.to_dict(orient="records")

    return df