# Source code for sortinghat.core.recommendations.matching

# -*- coding: utf-8 -*-
#
# Copyright (C) 2014-2021 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#     Santiago Dueñas <sduenas@bitergia.com>
#     Miguel Ángel Fernández <mafesan@bitergia.com>
#


import logging
import pandas

from django.forms.models import model_to_dict

from ..db import (find_individual_by_uuid)
from ..errors import NotFoundError
from ..models import Identity


# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def recommend_matches(source_uuids, target_uuids, criteria, verbose=False):
    """Recommend identity matches for a list of individuals.

    Returns a generator of identity matches recommendations
    based on a list of criteria composed by email addresses,
    name and/or usernames of the individuals.

    The function collects the identities of every individual in
    `source_uuids` and the identities of every individual in
    `target_uuids`, and delegates the comparison to `_find_matches`.
    The matches found are then looked up for each source uuid,
    first by the uuid itself and, if it has no entry, by the uuids
    of the identities that belong to the same individual.

    Each recommendation is a tuple with the uuid provided in the
    input list and a list with the matching individuals, or the
    matching identities if the `verbose` option is activated. The
    input uuid itself is never part of its own list of matches.

    When no matching is found for a given individual, or when the
    uuid is not found on the registry, an empty list is returned
    for it.

    When there are no `target_uuids`, the recommendations will
    be returned for each `source_uuids` against all identities
    on the registry.

    As this is a generator, nothing is computed until it is iterated,
    and `source_uuids` is traversed twice, so it should be a
    re-iterable collection such as a list.

    :param source_uuids: list of individual keys to find matches for
    :param target_uuids: list of individual keys where to find matches
    :param criteria: list of matching criteria (`email`, `name`, `username`)
    :param verbose: if set to `True`, the list of results will include
        individual identities. Otherwise, results will include main keys
        from individuals

    :returns: a generator of recommendations
    """

    def _get_identities(uuid):
        """Get the identities from a given Individual based on one of its uuids"""

        try:
            individual = find_individual_by_uuid(uuid)
        except NotFoundError:
            # Unknown uuids are not an error; they simply have no identities
            identities = []
        else:
            identities = individual.identities.all()

        return identities

    logger.debug(
        f"Generating matching recommendations; "
        f"source={source_uuids} target={target_uuids} criteria='{criteria}'; ..."
    )

    # Maps every source uuid to the uuids of all the identities
    # of its individual; used as fallback keys on the results
    aliases = {}
    input_set = set()
    target_set = set()

    for uuid in source_uuids:
        identities = _get_identities(uuid)
        aliases[uuid] = [identity.uuid for identity in identities]
        input_set.update(identities)

    if target_uuids:
        for uuid in target_uuids:
            target_set.update(_get_identities(uuid))
    else:
        # No explicit targets: compare against the whole registry
        target_set.update(Identity.objects.all())

    matched = _find_matches(input_set, target_set, criteria, verbose=verbose)

    # Return filtered results
    for uuid in source_uuids:
        found = ()
        if uuid in matched:
            found = matched[uuid]
        else:
            # When several aliases have matches, the last one wins
            for alias in aliases[uuid]:
                if alias in matched:
                    found = matched[alias]

        # Work on a private copy: the sets stored in `matched` may be
        # shared by several keys, so removing the input uuid in place
        # would corrupt the results of the source uuids yielded later.
        result = set(found)
        # Remove input uuid from results if needed
        result.discard(uuid)

        yield uuid, list(result)

    logger.info(f"Matching recommendations generated; criteria='{criteria}'")


def _find_matches(set_x, set_y, criteria, verbose):
    """Find identities matches between two sets using Pandas' library.

    This method find matches for the identities in `set_x` looking at
    the identities from `set_y` given a list of criteria.

    The identities sets are transformed into Pandas dataframes and
    they are filtered according to the different criteria, then
    merged and grouped by the identities from `set_x`. The grouped
    results are transformed into sets of results taking into account
    the results from the rest of results to generate complete sets
    of matches per each identity from `set_x`.

    :param set_x: list of individual keys to find matches for
    :param set_y: list of individual keys where to find matches
    :param criteria: list of matching criteria (`email`, `name`, `username`).
    :param verbose: if set to `True`, the list of results will include individual
        identities. Otherwise, results will include main keys from individuals.

    :returns: a dictionary including the set of matches found for each
        identity from `set_x`.
    """
    def _to_df(data_set):
        """Convert identities set into a Pandas `Dataframe` object"""
        df = pandas.DataFrame(data_set)
        return df.sort_values(['individual'])

    def _filter_criteria(df, c):
        """Filter dataframe creating a basic subset including a given column"""
        cols = ['uuid', 'individual', c]
        cdf = df[cols]
        return cdf.dropna(subset=[c])

    def _calculate_matches_groups(grouped_uids, verbose=False):
        """Calculate groups of matching identities from identity groups.

        For instance, given a list of matched unique identities like
        A = {A, B}; B = {B,A,C}, C = {C,} and D = {D,} the output
        for keys A, B and C will be the set {A, B, C}. As D has no matches,
        it won't be included in any group and it won't be returned.

        :param grouped_uids: groups of unique identities
        :param verbose: if true, the grouping will be calculated using
            unique identities (uuids) instead of main keys from individuals.

        :returns: a dictionary including the set of matches for each
            group key.
        """
        matches = {}
        # Group by main keys from Individuals or by uuids from Identities
        col_name = 'uuid_y' if verbose else 'individual_y'

        sorted_keys = sorted(grouped_uids.groups.keys())

        while sorted_keys:
            group_key = sorted_keys.pop(0)
            uuid_set = set()
            for uuid in grouped_uids.get_group(group_key)[col_name]:
                uuid_set.add(uuid)

            for key in matches:
                prev_match = matches[key]
                if prev_match == uuid_set:
                    continue
                elif prev_match.intersection(uuid_set):
                    prev_match.update(uuid_set)
                    uuid_set = prev_match
            matches[group_key] = uuid_set

        return matches

    data_x = [model_to_dict(fl) for fl in set_x]
    data_y = [model_to_dict(fl) for fl in set_y]

    if (not data_x) or (not data_y):
        return {}

    df_x = _to_df(data_x)
    df_y = _to_df(data_y)

    cdfs = []

    for c in criteria:
        cdf_x = _filter_criteria(df_x, c)
        cdf_y = _filter_criteria(df_y, c)
        cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner')
        cdf = cdf[['individual_y', 'uuid_x', 'uuid_y']]
        cdfs.append(cdf)

    result = pandas.concat(cdfs)
    result = result.drop_duplicates()

    g_result = result.groupby(by=['uuid_x'],
                              as_index=True, sort=True)

    matched = _calculate_matches_groups(g_result, verbose=verbose)

    return matched