2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE QuasiQuotes #-}
13 {-# LANGUAGE Arrows #-}
15 module Gargantext.Database.Action.Flow.Pairing
19 import Debug.Trace (trace)
20 import Control.Lens (_Just, (^.), view)
21 import Data.Hashable (Hashable)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (fromMaybe, catMaybes)
25 import Data.Text (Text)
26 import Gargantext.API.Ngrams.Tools
27 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
28 import Gargantext.API.Prelude (GargNoServer)
29 import Gargantext.Core
30 import Gargantext.Core.Text.Metrics.CharByChar (levenshtein)
31 import Gargantext.Core.Types (TableResult(..))
32 import Gargantext.Core.Types.Main
33 import Gargantext.Database
34 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser)
35 import Gargantext.Database.Admin.Config
36 import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..))
37 import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId)
38 import Gargantext.Database.Query.Prelude (returnA, queryNodeNodeTable)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
41 import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
42 import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext)
43 import Gargantext.Database.Query.Table.NodeNode (insertNodeNode)
44 import Gargantext.Database.Schema.Ngrams -- (NgramsType(..))
45 import Gargantext.Database.Schema.Node
46 -- import Gargantext.Database.Schema.Context
47 import qualified Data.HashMap.Strict as HM
48 import Gargantext.Prelude hiding (sum)
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.List as List
52 import qualified Data.Set as Set
53 import qualified Data.Text as Text
56 -- All NodeAnnuaire paired with a Corpus of NodeId nId:
57 -- isPairedWith NodeAnnuaire corpusId
58 isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId]
59 isPairedWith nId nt = runOpaQuery (selectQuery nt nId)
61 selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4)
62 selectQuery nt' nId' = proc () -> do
63 node <- queryNodeTable -< ()
64 node_node <- optionalRestrict queryNodeNodeTable -<
65 \node_node' -> (node ^. node_id) .== (node_node' ^. nn_node2_id)
66 restrict -< (node^.node_typename) .== sqlInt4 (toDBid nt')
67 restrict -< (view nn_node1_id <$> node_node) .=== justFields (pgNodeId nId')
68 returnA -< node^.node_id
70 -----------------------------------------------------------------------
71 pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int]
74 Nothing -> defaultList c
76 dataPaired <- dataPairing a (c,l,Authors)
77 _ <- insertNodeNode [ NodeNode c a Nothing Nothing]
78 insertNodeContext_NodeContext $ prepareInsert c a dataPaired
81 dataPairing :: AnnuaireId
82 -> (CorpusId, ListId, NgramsType)
83 -> GargNoServer (HashMap ContactId (Set DocId))
84 dataPairing aId (cId, lId, ngt) = do
85 -- mc :: HM.HashMap ContactName (Set ContactId)
86 mc <- getNgramsContactId aId
87 -- md :: HM.HashMap DocAuthor (Set DocId)
88 md <- getNgramsDocId cId lId ngt
89 -- printDebug "dataPairing authors" (HM.keys md)
90 let result = fusion mc md
91 -- printDebug "dataPairing" (length $ HM.keys result)
95 prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId)
96 -> [(CorpusId, AnnuaireId, DocId, ContactId)]
97 prepareInsert corpusId annuaireId mapContactDocs =
98 map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId))
100 $ map (\(contactId, setDocIds)
102 -> (contactId, setDocId)
103 ) $ Set.toList setDocIds
105 $ HM.toList mapContactDocs
107 ------------------------------------------------------------------------
108 type ContactName = NgramsTerm
109 type DocAuthor = NgramsTerm
110 type Projected = NgramsTerm
112 fusion :: HashMap ContactName (Set ContactId)
113 -> HashMap DocAuthor (Set DocId)
114 -> HashMap ContactId (Set DocId)
115 fusion mc md = HM.fromListWith (<>)
117 $ map (\(docAuthor, docs)
118 -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of
120 Just author -> case HM.lookup author mc of
122 Just contactIds -> map (\contactId -> (contactId, docs))
123 $ Set.toList contactIds
127 fusion'' :: HashMap ContactName (Set ContactId)
128 -> HashMap DocAuthor (Set DocId)
129 -> HashMap ContactId (Set DocId)
130 fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md)
133 fusion' :: HashMap ContactName (Set ContactId)
134 -> HashMap DocId (Set DocAuthor)
135 -> HashMap DocId (Set ContactId)
136 fusion' mc md = HM.fromListWith (<>)
137 $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc)))
140 getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId
141 getContactIds mapContactNames contactNames =
142 if Set.null contactNames
144 else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames
146 getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName
147 getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames
149 setContactNames = if Set.null xs then ys else xs
150 xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors
151 ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of
153 Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames)
154 $ Set.toList setAuthors
157 getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm
158 getClosest f (NgramsTerm from) candidates = fst <$> head scored
160 scored = List.sortOn snd
161 $ List.filter (\(_,score) -> score <= 2)
162 $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates
165 ------------------------------------------------------------------------
166 getNgramsContactId :: AnnuaireId
167 -> Cmd err (HashMap ContactName (Set NodeId))
168 getNgramsContactId aId = do
169 contacts <- getAllContacts aId
170 -- printDebug "getAllContexts" (tr_count contacts)
171 let paired= HM.fromListWith (<>)
172 $ map (\contact -> (toName contact, Set.singleton (contact^.node_id))
174 -- printDebug "paired" (HM.keys paired)
176 -- POC here, should be a probabilistic function (see the one used to find lang)
177 toName :: Node HyperdataContact -> NgramsTerm
178 -- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName)
179 toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName)
181 firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName)
182 lastName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName)
184 getNgramsDocId :: CorpusId
187 -> GargNoServer (HashMap DocAuthor (Set NodeId))
188 getNgramsDocId cId lId nt = do
189 lIds <- selectNodesWithUsername NodeList userMaster
190 repo <- getRepo (lId:lIds)
191 let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo
192 -- printDebug "getNgramsDocId" ngs
194 groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
196 hashmapReverse :: (Ord a, Eq b, Hashable b)
197 => HashMap a (Set b) -> HashMap b (Set a)
198 hashmapReverse m = HM.fromListWith (<>)
200 $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs])