]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow/Pairing.hs
[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...
[gargantext.git] / src / Gargantext / Database / Action / Flow / Pairing.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE QuasiQuotes #-}
13 {-# LANGUAGE Arrows #-}
14
15 module Gargantext.Database.Action.Flow.Pairing
16 -- (pairing)
17 where
18
19 import Debug.Trace (trace)
20 import Control.Lens (_Just, (^.), view)
21 import Data.Hashable (Hashable)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (fromMaybe, catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Gargantext.API.Ngrams.Tools
27 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
28 import Gargantext.API.Prelude (GargNoServer)
29 import Gargantext.Core
30 import Gargantext.Core.Text.Metrics.CharByChar (levenshtein)
31 import Gargantext.Core.Types (TableResult(..))
32 import Gargantext.Core.Types.Main
33 import Gargantext.Database
34 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser)
35 import Gargantext.Database.Admin.Config
36 import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..))
37 import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId)
38 import Gargantext.Database.Query.Prelude (returnA, queryNodeNodeTable)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
41 import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
42 import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext)
43 import Gargantext.Database.Query.Table.NodeNode (insertNodeNode)
44 import Gargantext.Database.Schema.Ngrams -- (NgramsType(..))
45 import Gargantext.Database.Schema.Node
46 -- import Gargantext.Database.Schema.Context
47 import qualified Data.HashMap.Strict as HM
48 import Gargantext.Prelude hiding (sum)
49 import Opaleye
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.List as List
52 import qualified Data.Set as Set
53 import qualified Data.Text as Text
54
55 -- | isPairedWith
56 -- All NodeAnnuaire paired with a Corpus of NodeId nId:
57 -- isPairedWith NodeAnnuaire corpusId
58 isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId]
59 isPairedWith nId nt = runOpaQuery (selectQuery nt nId)
60 where
61 selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4)
62 selectQuery nt' nId' = proc () -> do
63 node <- queryNodeTable -< ()
64 node_node <- optionalRestrict queryNodeNodeTable -<
65 \node_node' -> (node ^. node_id) .== (node_node' ^. nn_node2_id)
66 restrict -< (node^.node_typename) .== sqlInt4 (toDBid nt')
67 restrict -< (view nn_node1_id <$> node_node) .=== justFields (pgNodeId nId')
68 returnA -< node^.node_id
69
70 -----------------------------------------------------------------------
71 pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int]
72 pairing a c l' = do
73 l <- case l' of
74 Nothing -> defaultList c
75 Just l'' -> pure l''
76 dataPaired <- dataPairing a (c,l,Authors)
77 _ <- insertNodeNode [ NodeNode c a Nothing Nothing]
78 insertNodeContext_NodeContext $ prepareInsert c a dataPaired
79
80
81 dataPairing :: AnnuaireId
82 -> (CorpusId, ListId, NgramsType)
83 -> GargNoServer (HashMap ContactId (Set DocId))
84 dataPairing aId (cId, lId, ngt) = do
85 -- mc :: HM.HashMap ContactName (Set ContactId)
86 mc <- getNgramsContactId aId
87 -- md :: HM.HashMap DocAuthor (Set DocId)
88 md <- getNgramsDocId cId lId ngt
89 -- printDebug "dataPairing authors" (HM.keys md)
90 let result = fusion mc md
91 -- printDebug "dataPairing" (length $ HM.keys result)
92 pure result
93
94
95 prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId)
96 -> [(CorpusId, AnnuaireId, DocId, ContactId)]
97 prepareInsert corpusId annuaireId mapContactDocs =
98 map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId))
99 $ List.concat
100 $ map (\(contactId, setDocIds)
101 -> map (\setDocId
102 -> (contactId, setDocId)
103 ) $ Set.toList setDocIds
104 )
105 $ HM.toList mapContactDocs
106
107 ------------------------------------------------------------------------
108 type ContactName = NgramsTerm
109 type DocAuthor = NgramsTerm
110 type Projected = NgramsTerm
111
112 fusion :: HashMap ContactName (Set ContactId)
113 -> HashMap DocAuthor (Set DocId)
114 -> HashMap ContactId (Set DocId)
115 fusion mc md = HM.fromListWith (<>)
116 $ List.concat
117 $ map (\(docAuthor, docs)
118 -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of
119 Nothing -> []
120 Just author -> case HM.lookup author mc of
121 Nothing -> []
122 Just contactIds -> map (\contactId -> (contactId, docs))
123 $ Set.toList contactIds
124 )
125 $ HM.toList md
126
127 fusion'' :: HashMap ContactName (Set ContactId)
128 -> HashMap DocAuthor (Set DocId)
129 -> HashMap ContactId (Set DocId)
130 fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md)
131
132
133 fusion' :: HashMap ContactName (Set ContactId)
134 -> HashMap DocId (Set DocAuthor)
135 -> HashMap DocId (Set ContactId)
136 fusion' mc md = HM.fromListWith (<>)
137 $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc)))
138 $ HM.toList md
139
140 getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId
141 getContactIds mapContactNames contactNames =
142 if Set.null contactNames
143 then Set.empty
144 else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames
145
146 getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName
147 getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames
148 where
149 setContactNames = if Set.null xs then ys else xs
150 xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors
151 ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of
152 Nothing -> Nothing
153 Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames)
154 $ Set.toList setAuthors
155
156
157 getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm
158 getClosest f (NgramsTerm from) candidates = fst <$> head scored
159 where
160 scored = List.sortOn snd
161 $ List.filter (\(_,score) -> score <= 2)
162 $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates
163
164
165 ------------------------------------------------------------------------
166 getNgramsContactId :: AnnuaireId
167 -> Cmd err (HashMap ContactName (Set NodeId))
168 getNgramsContactId aId = do
169 contacts <- getAllContacts aId
170 -- printDebug "getAllContexts" (tr_count contacts)
171 let paired= HM.fromListWith (<>)
172 $ map (\contact -> (toName contact, Set.singleton (contact^.node_id))
173 ) (tr_docs contacts)
174 -- printDebug "paired" (HM.keys paired)
175 pure paired
176 -- POC here, should be a probabilistic function (see the one used to find lang)
177 toName :: Node HyperdataContact -> NgramsTerm
178 -- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName)
179 toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName)
180 where
181 firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName)
182 lastName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName)
183
184 getNgramsDocId :: CorpusId
185 -> ListId
186 -> NgramsType
187 -> GargNoServer (HashMap DocAuthor (Set NodeId))
188 getNgramsDocId cId lId nt = do
189 lIds <- selectNodesWithUsername NodeList userMaster
190 repo <- getRepo (lId:lIds)
191 let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo
192 -- printDebug "getNgramsDocId" ngs
193
194 groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
195
196 hashmapReverse :: (Ord a, Eq b, Hashable b)
197 => HashMap a (Set b) -> HashMap b (Set a)
198 hashmapReverse m = HM.fromListWith (<>)
199 $ List.concat
200 $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs])
201 $ HM.toList m