{-| Module : Gargantext.Database.Flow Description : Database Flow Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE Arrows #-} module Gargantext.Database.Action.Flow.Pairing -- (pairing) where import Debug.Trace (trace) import Control.Lens (_Just, (^.), view) import Data.Hashable (Hashable) import Data.HashMap.Strict (HashMap) import Data.Maybe (fromMaybe, catMaybes) import Data.Set (Set) import Data.Text (Text) import Gargantext.API.Ngrams.Tools import Gargantext.API.Ngrams.Types (NgramsTerm(..)) import Gargantext.API.Prelude (GargNoServer) import Gargantext.Core import Gargantext.Core.Text.Metrics.CharByChar (levenshtein) import Gargantext.Core.Types (TableResult(..)) import Gargantext.Core.Types.Main import Gargantext.Database import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser) import Gargantext.Database.Admin.Config import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..)) import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId) import Gargantext.Database.Query.Prelude (returnA, queryNodeNodeTable) import Gargantext.Database.Query.Table.Node (defaultList) import Gargantext.Database.Query.Table.Node.Children (getAllContacts) import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername) import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext) import Gargantext.Database.Query.Table.NodeNode (insertNodeNode) import Gargantext.Database.Schema.Ngrams -- (NgramsType(..)) import Gargantext.Database.Schema.Node -- import Gargantext.Database.Schema.Context import qualified Data.HashMap.Strict as HM import Gargantext.Prelude hiding (sum) import Opaleye import qualified Data.HashMap.Strict as HashMap import qualified Data.List as List import qualified Data.Set as Set import qualified Data.Text as Text -- | isPairedWith -- All NodeAnnuaire paired with a Corpus of NodeId nId: -- isPairedWith NodeAnnuaire corpusId isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId] isPairedWith nId nt = runOpaQuery (selectQuery nt nId) where selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4) selectQuery nt' nId' = proc () -> do node <- queryNodeTable -< () node_node <- optionalRestrict queryNodeNodeTable -< \node_node' -> (node ^. node_id) .== (node_node' ^. nn_node2_id) restrict -< (node^.node_typename) .== sqlInt4 (toDBid nt') restrict -< (view nn_node1_id <$> node_node) .=== justFields (pgNodeId nId') returnA -< node^.node_id ----------------------------------------------------------------------- pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int] pairing a c l' = do l <- case l' of Nothing -> defaultList c Just l'' -> pure l'' dataPaired <- dataPairing a (c,l,Authors) _ <- insertNodeNode [ NodeNode c a Nothing Nothing] insertNodeContext_NodeContext $ prepareInsert c a dataPaired dataPairing :: AnnuaireId -> (CorpusId, ListId, NgramsType) -> GargNoServer (HashMap ContactId (Set DocId)) dataPairing aId (cId, lId, ngt) = do -- mc :: HM.HashMap ContactName (Set ContactId) mc <- getNgramsContactId aId -- md :: HM.HashMap DocAuthor (Set DocId) md <- getNgramsDocId cId lId ngt -- printDebug "dataPairing authors" (HM.keys md) let result = fusion mc md -- printDebug "dataPairing" (length $ HM.keys result) pure result prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId) -> [(CorpusId, AnnuaireId, DocId, ContactId)] prepareInsert corpusId annuaireId mapContactDocs = map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId)) $ List.concat $ map (\(contactId, setDocIds) -> map (\setDocId -> (contactId, setDocId) ) $ Set.toList setDocIds ) $ HM.toList mapContactDocs ------------------------------------------------------------------------ type ContactName = NgramsTerm type DocAuthor = NgramsTerm type Projected = NgramsTerm fusion :: HashMap ContactName (Set ContactId) -> HashMap DocAuthor (Set DocId) -> HashMap ContactId (Set DocId) fusion mc md = HM.fromListWith (<>) $ List.concat $ map (\(docAuthor, docs) -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of Nothing -> [] Just author -> case HM.lookup author mc of Nothing -> [] Just contactIds -> map (\contactId -> (contactId, docs)) $ Set.toList contactIds ) $ HM.toList md fusion'' :: HashMap ContactName (Set ContactId) -> HashMap DocAuthor (Set DocId) -> HashMap ContactId (Set DocId) fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md) fusion' :: HashMap ContactName (Set ContactId) -> HashMap DocId (Set DocAuthor) -> HashMap DocId (Set ContactId) fusion' mc md = HM.fromListWith (<>) $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc))) $ HM.toList md getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId getContactIds mapContactNames contactNames = if Set.null contactNames then Set.empty else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames where setContactNames = if Set.null xs then ys else xs xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of Nothing -> Nothing Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames) $ Set.toList setAuthors getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm getClosest f (NgramsTerm from) candidates = fst <$> head scored where scored = List.sortOn snd $ List.filter (\(_,score) -> score <= 2) $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates ------------------------------------------------------------------------ getNgramsContactId :: AnnuaireId -> Cmd err (HashMap ContactName (Set NodeId)) getNgramsContactId aId = do contacts <- getAllContacts aId -- printDebug "getAllContexts" (tr_count contacts) let paired= HM.fromListWith (<>) $ map (\contact -> (toName contact, Set.singleton (contact^.node_id)) ) (tr_docs contacts) -- printDebug "paired" (HM.keys paired) pure paired -- POC here, should be a probabilistic function (see the one used to find lang) toName :: Node HyperdataContact -> NgramsTerm -- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName) toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName) where firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName) lastName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName) getNgramsDocId :: CorpusId -> ListId -> NgramsType -> GargNoServer (HashMap DocAuthor (Set NodeId)) getNgramsDocId cId lId nt = do lIds <- selectNodesWithUsername NodeList userMaster repo <- getRepo (lId:lIds) let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo -- printDebug "getNgramsDocId" ngs groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs) hashmapReverse :: (Ord a, Eq b, Hashable b) => HashMap a (Set b) -> HashMap b (Set a) hashmapReverse m = HM.fromListWith (<>) $ List.concat $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs]) $ HM.toList m