Source code for sortinghat.core.recommendations.matching

# -*- coding: utf-8 -*-
#
# Copyright (C) 2014-2021 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Santiago Dueñas <sduenas@bitergia.com>
#     Miguel Ángel Fernández <mafesan@bitergia.com>
#


import logging
import pandas

from django.forms.models import model_to_dict

from ..db import (find_individual_by_uuid)
from ..errors import NotFoundError
from ..models import Identity


# Module-level logger, named after this module's import path (`__name__`).
logger = logging.getLogger(__name__)


def recommend_matches(source_uuids, target_uuids, criteria, verbose=False):
    """Recommend identity matches for a list of individuals.

    Returns a generator of identity matches recommendations
    based on a list of criteria composed by email addresses,
    name and/or usernames of the individuals.

    The function checks if any identity from each individual
    matches with a given set of target individuals. The actual
    matching is delegated to `_find_matches`, which is fed with
    the identities of the source individuals and the identities
    of the target individuals.

    Each recommendation contains the uuid of the individual
    provided in the input list and a list with the matching
    individuals or the matching identities if the `verbose`
    option is activated. When no matching is found for a given
    individual, an empty list is returned. The uuid given as
    input is never part of its own list of matches.

    When there are no `target_uuids`, the recommendations will
    be returned for each `source_uuids` against all identities
    on the registry.

    `source_uuids` is iterated twice (once to collect identities,
    once to produce the results), so it must be a re-iterable
    collection such as a list.

    :param source_uuids: list of individual keys to find matches for
    :param target_uuids: list of individual keys where to find matches
    :param criteria: list of matching criteria (`email`, `name`,
        `username`)
    :param verbose: if set to `True`, the list of results will include
        individual identities. Otherwise, results will include main keys
        from individuals

    :returns: a generator of recommendations, as `(uuid, matches)` tuples
    """
    def _get_identities(uuid):
        """Get the identities from a given Individual based on one of its uuids"""

        try:
            individual = find_individual_by_uuid(uuid)
        except NotFoundError:
            # Unknown individuals simply contribute no identities
            identities = []
        else:
            identities = individual.identities.all()

        return identities

    logger.debug(
        f"Generating matching recommendations; "
        f"source={source_uuids} target={target_uuids} criteria='{criteria}'; ..."
    )

    # Maps every source uuid to the uuids of all its identities, so the
    # results can be found even when they are stored under an identity
    # uuid that differs from the one given as input.
    aliases = {}
    input_set = set()
    target_set = set()

    for uuid in source_uuids:
        identities = _get_identities(uuid)
        aliases[uuid] = [identity.uuid for identity in identities]
        input_set.update(identities)

    if target_uuids:
        for uuid in target_uuids:
            identities = _get_identities(uuid)
            target_set.update(identities)
    else:
        # No targets given: compare against every identity on the registry
        identities = Identity.objects.all()
        target_set.update(identities)

    matched = _find_matches(input_set, target_set, criteria, verbose=verbose)

    # Return filtered results
    for uuid in source_uuids:
        if uuid in matched:
            found = matched[uuid]
        else:
            found = set()
            # When several aliases have matches, the last one wins
            for alias in aliases[uuid]:
                if alias in matched:
                    found = matched[alias]

        # Leave the input uuid out of its own results. The sets stored in
        # `matched` can be the very same object under several keys, so they
        # must not be modified in place: doing so would silently drop this
        # uuid from the results of the source uuids processed afterwards.
        result = [match for match in found if match != uuid]

        yield uuid, result

    logger.info(f"Matching recommendations generated; criteria='{criteria}'")
def _find_matches(set_x, set_y, criteria, verbose): """Find identities matches between two sets using Pandas' library. This method find matches for the identities in `set_x` looking at the identities from `set_y` given a list of criteria. The identities sets are transformed into Pandas dataframes and they are filtered according to the different criteria, then merged and grouped by the identities from `set_x`. The grouped results are transformed into sets of results taking into account the results from the rest of results to generate complete sets of matches per each identity from `set_x`. :param set_x: list of individual keys to find matches for :param set_y: list of individual keys where to find matches :param criteria: list of matching criteria (`email`, `name`, `username`). :param verbose: if set to `True`, the list of results will include individual identities. Otherwise, results will include main keys from individuals. :returns: a dictionary including the set of matches found for each identity from `set_x`. """ def _to_df(data_set): """Convert identities set into a Pandas `Dataframe` object""" df = pandas.DataFrame(data_set) return df.sort_values(['individual']) def _filter_criteria(df, c): """Filter dataframe creating a basic subset including a given column""" cols = ['uuid', 'individual', c] cdf = df[cols] return cdf.dropna(subset=[c]) def _calculate_matches_groups(grouped_uids, verbose=False): """Calculate groups of matching identities from identity groups. For instance, given a list of matched unique identities like A = {A, B}; B = {B,A,C}, C = {C,} and D = {D,} the output for keys A, B and C will be the set {A, B, C}. As D has no matches, it won't be included in any group and it won't be returned. :param grouped_uids: groups of unique identities :param verbose: if true, the grouping will be calculated using unique identities (uuids) instead of main keys from individuals. :returns: a dictionary including the set of matches for each group key. 
""" matches = {} # Group by main keys from Individuals or by uuids from Identities col_name = 'uuid_y' if verbose else 'individual_y' sorted_keys = sorted(grouped_uids.groups.keys()) while sorted_keys: group_key = sorted_keys.pop(0) uuid_set = set() for uuid in grouped_uids.get_group(group_key)[col_name]: uuid_set.add(uuid) for key in matches: prev_match = matches[key] if prev_match == uuid_set: continue elif prev_match.intersection(uuid_set): prev_match.update(uuid_set) uuid_set = prev_match matches[group_key] = uuid_set return matches data_x = [model_to_dict(fl) for fl in set_x] data_y = [model_to_dict(fl) for fl in set_y] if (not data_x) or (not data_y): return {} df_x = _to_df(data_x) df_y = _to_df(data_y) cdfs = [] for c in criteria: cdf_x = _filter_criteria(df_x, c) cdf_y = _filter_criteria(df_y, c) cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner') cdf = cdf[['individual_y', 'uuid_x', 'uuid_y']] cdfs.append(cdf) result = pandas.concat(cdfs) result = result.drop_duplicates() g_result = result.groupby(by=['uuid_x'], as_index=True, sort=True) matched = _calculate_matches_groups(g_result, verbose=verbose) return matched