2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE QuasiQuotes #-}
13 {-# LANGUAGE Arrows #-}
15 module Gargantext.Database.Action.Flow.Pairing
19 import Debug.Trace (trace)
20 import Control.Lens (_Just, (^.))
21 import Data.Hashable (Hashable)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (fromMaybe, catMaybes)
25 import Data.Text (Text)
26 import Gargantext.API.Ngrams.Tools
27 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
28 import Gargantext.API.Prelude (GargNoServer)
29 import Gargantext.Core
30 import Gargantext.Core.Text.Metrics.CharByChar (levenshtein)
31 import Gargantext.Core.Types (TableResult(..))
32 import Gargantext.Core.Types.Main
33 import Gargantext.Database
34 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser)
35 import Gargantext.Database.Admin.Config
36 import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..))
37 import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId)
38 import Gargantext.Database.Query.Prelude (leftJoin2, returnA, queryNodeNodeTable)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
41 import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
42 import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext)
43 import Gargantext.Database.Query.Table.NodeNode (insertNodeNode)
44 import Gargantext.Database.Schema.Ngrams -- (NgramsType(..))
45 import Gargantext.Database.Schema.Node
46 -- import Gargantext.Database.Schema.Context
47 import qualified Data.HashMap.Strict as HM
48 import Gargantext.Prelude hiding (sum)
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.List as List
52 import qualified Data.Set as Set
53 import qualified Data.Text as Text
56 -- All NodeAnnuaire paired with a Corpus of NodeId nId:
57 -- isPairedWith NodeAnnuaire corpusId
58 isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId]
59 isPairedWith nId nt = runOpaQuery (selectQuery nt nId)
61 selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4)
62 selectQuery nt' nId' = proc () -> do
63 (node, node_node) <- queryJoin -< ()
64 restrict -< (node^.node_typename) .== (sqlInt4 $ toDBid nt')
65 restrict -< (node_node^.nn_node1_id) .== (toNullable $ pgNodeId nId')
66 returnA -< node^.node_id
68 queryJoin :: Select (NodeRead, NodeNodeReadNull)
69 queryJoin = leftJoin2 queryNodeTable queryNodeNodeTable cond
71 cond (node, node_node) = node^.node_id .== node_node^. nn_node2_id
73 -----------------------------------------------------------------------
74 pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int]
77 Nothing -> defaultList c
79 dataPaired <- dataPairing a (c,l,Authors)
80 _ <- insertNodeNode [ NodeNode c a Nothing Nothing]
81 insertNodeContext_NodeContext $ prepareInsert c a dataPaired
84 dataPairing :: AnnuaireId
85 -> (CorpusId, ListId, NgramsType)
86 -> GargNoServer (HashMap ContactId (Set DocId))
87 dataPairing aId (cId, lId, ngt) = do
88 -- mc :: HM.HashMap ContactName (Set ContactId)
89 mc <- getNgramsContactId aId
90 -- md :: HM.HashMap DocAuthor (Set DocId)
91 md <- getNgramsDocId cId lId ngt
92 -- printDebug "dataPairing authors" (HM.keys md)
93 let result = fusion mc md
94 -- printDebug "dataPairing" (length $ HM.keys result)
98 prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId)
99 -> [(CorpusId, AnnuaireId, DocId, ContactId)]
100 prepareInsert corpusId annuaireId mapContactDocs =
101 map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId))
103 $ map (\(contactId, setDocIds)
105 -> (contactId, setDocId)
106 ) $ Set.toList setDocIds
108 $ HM.toList mapContactDocs
110 ------------------------------------------------------------------------
111 type ContactName = NgramsTerm
112 type DocAuthor = NgramsTerm
113 type Projected = NgramsTerm
115 fusion :: HashMap ContactName (Set ContactId)
116 -> HashMap DocAuthor (Set DocId)
117 -> HashMap ContactId (Set DocId)
118 fusion mc md = HM.fromListWith (<>)
120 $ map (\(docAuthor, docs)
121 -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of
123 Just author -> case HM.lookup author mc of
125 Just contactIds -> map (\contactId -> (contactId, docs))
126 $ Set.toList contactIds
130 fusion'' :: HashMap ContactName (Set ContactId)
131 -> HashMap DocAuthor (Set DocId)
132 -> HashMap ContactId (Set DocId)
133 fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md)
136 fusion' :: HashMap ContactName (Set ContactId)
137 -> HashMap DocId (Set DocAuthor)
138 -> HashMap DocId (Set ContactId)
139 fusion' mc md = HM.fromListWith (<>)
140 $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc)))
143 getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId
144 getContactIds mapContactNames contactNames =
145 if Set.null contactNames
147 else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames
149 getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName
150 getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames
152 setContactNames = if Set.null xs then ys else xs
153 xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors
154 ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of
156 Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames)
157 $ Set.toList setAuthors
160 getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm
161 getClosest f (NgramsTerm from) candidates = fst <$> head scored
163 scored = List.sortOn snd
164 $ List.filter (\(_,score) -> score <= 2)
165 $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates
168 ------------------------------------------------------------------------
169 getNgramsContactId :: AnnuaireId
170 -> Cmd err (HashMap ContactName (Set NodeId))
171 getNgramsContactId aId = do
172 contacts <- getAllContacts aId
173 -- printDebug "getAllContexts" (tr_count contacts)
174 let paired= HM.fromListWith (<>)
175 $ map (\contact -> (toName contact, Set.singleton (contact^.node_id))
177 -- printDebug "paired" (HM.keys paired)
179 -- POC here, should be a probabilistic function (see the one used to find lang)
180 toName :: Node HyperdataContact -> NgramsTerm
181 -- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName)
182 toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName)
184 firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName)
185 lastName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName)
187 getNgramsDocId :: CorpusId
190 -> GargNoServer (HashMap DocAuthor (Set NodeId))
191 getNgramsDocId cId lId nt = do
192 lIds <- selectNodesWithUsername NodeList userMaster
193 repo <- getRepo (lId:lIds)
194 let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo
195 -- printDebug "getNgramsDocId" ngs
197 groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
199 hashmapReverse :: (Ord a, Eq b, Hashable b)
200 => HashMap a (Set b) -> HashMap b (Set a)
201 hashmapReverse m = HM.fromListWith (<>)
203 $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs])