]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow/Pairing.hs
[ngrams by context] fix subquery to limit only to current corpus
[gargantext.git] / src / Gargantext / Database / Action / Flow / Pairing.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE QuasiQuotes #-}
13 {-# LANGUAGE Arrows #-}
14
15 module Gargantext.Database.Action.Flow.Pairing
16 -- (pairing)
17 where
18
19 import Debug.Trace (trace)
20 import Control.Lens (_Just, (^.))
21 import Data.Hashable (Hashable)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (fromMaybe, catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Gargantext.API.Ngrams.Tools
27 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
28 import Gargantext.API.Prelude (GargNoServer)
29 import Gargantext.Core
30 import Gargantext.Core.Text.Metrics.CharByChar (levenshtein)
31 import Gargantext.Core.Types (TableResult(..))
32 import Gargantext.Core.Types.Main
33 import Gargantext.Database
34 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsOnlyUser)
35 import Gargantext.Database.Admin.Config
36 import Gargantext.Database.Admin.Types.Hyperdata -- (HyperdataContact(..))
37 import Gargantext.Database.Admin.Types.Node -- (AnnuaireId, CorpusId, ListId, DocId, ContactId, NodeId)
38 import Gargantext.Database.Query.Prelude (leftJoin2, returnA, queryNodeNodeTable)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Children (getAllContacts)
41 import Gargantext.Database.Query.Table.Node.Select (selectNodesWithUsername)
42 import Gargantext.Database.Query.Table.NodeContext_NodeContext (insertNodeContext_NodeContext)
43 import Gargantext.Database.Query.Table.NodeNode (insertNodeNode)
44 import Gargantext.Database.Schema.Ngrams -- (NgramsType(..))
45 import Gargantext.Database.Schema.Node
46 -- import Gargantext.Database.Schema.Context
47 import qualified Data.HashMap.Strict as HM
48 import Gargantext.Prelude hiding (sum)
49 import Opaleye
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.List as List
52 import qualified Data.Set as Set
53 import qualified Data.Text as Text
54
55 -- | isPairedWith
56 -- All NodeAnnuaire paired with a Corpus of NodeId nId:
57 -- isPairedWith NodeAnnuaire corpusId
58 isPairedWith :: NodeId -> NodeType -> Cmd err [NodeId]
59 isPairedWith nId nt = runOpaQuery (selectQuery nt nId)
60 where
61 selectQuery :: NodeType -> NodeId -> Select (Column SqlInt4)
62 selectQuery nt' nId' = proc () -> do
63 (node, node_node) <- queryJoin -< ()
64 restrict -< (node^.node_typename) .== (sqlInt4 $ toDBid nt')
65 restrict -< (node_node^.nn_node1_id) .== (toNullable $ pgNodeId nId')
66 returnA -< node^.node_id
67
68 queryJoin :: Select (NodeRead, NodeNodeReadNull)
69 queryJoin = leftJoin2 queryNodeTable queryNodeNodeTable cond
70 where
71 cond (node, node_node) = node^.node_id .== node_node^. nn_node2_id
72
73 -----------------------------------------------------------------------
74 pairing :: AnnuaireId -> CorpusId -> Maybe ListId -> GargNoServer [Int]
75 pairing a c l' = do
76 l <- case l' of
77 Nothing -> defaultList c
78 Just l'' -> pure l''
79 dataPaired <- dataPairing a (c,l,Authors)
80 _ <- insertNodeNode [ NodeNode c a Nothing Nothing]
81 insertNodeContext_NodeContext $ prepareInsert c a dataPaired
82
83
84 dataPairing :: AnnuaireId
85 -> (CorpusId, ListId, NgramsType)
86 -> GargNoServer (HashMap ContactId (Set DocId))
87 dataPairing aId (cId, lId, ngt) = do
88 -- mc :: HM.HashMap ContactName (Set ContactId)
89 mc <- getNgramsContactId aId
90 -- md :: HM.HashMap DocAuthor (Set DocId)
91 md <- getNgramsDocId cId lId ngt
92 -- printDebug "dataPairing authors" (HM.keys md)
93 let result = fusion mc md
94 -- printDebug "dataPairing" (length $ HM.keys result)
95 pure result
96
97
98 prepareInsert :: CorpusId -> AnnuaireId -> HashMap ContactId (Set DocId)
99 -> [(CorpusId, AnnuaireId, DocId, ContactId)]
100 prepareInsert corpusId annuaireId mapContactDocs =
101 map (\(contactId,docId) -> (corpusId, docId, annuaireId, contactId))
102 $ List.concat
103 $ map (\(contactId, setDocIds)
104 -> map (\setDocId
105 -> (contactId, setDocId)
106 ) $ Set.toList setDocIds
107 )
108 $ HM.toList mapContactDocs
109
110 ------------------------------------------------------------------------
111 type ContactName = NgramsTerm
112 type DocAuthor = NgramsTerm
113 type Projected = NgramsTerm
114
115 fusion :: HashMap ContactName (Set ContactId)
116 -> HashMap DocAuthor (Set DocId)
117 -> HashMap ContactId (Set DocId)
118 fusion mc md = HM.fromListWith (<>)
119 $ List.concat
120 $ map (\(docAuthor, docs)
121 -> case (getClosest Text.toLower docAuthor (HM.keys mc)) of
122 Nothing -> []
123 Just author -> case HM.lookup author mc of
124 Nothing -> []
125 Just contactIds -> map (\contactId -> (contactId, docs))
126 $ Set.toList contactIds
127 )
128 $ HM.toList md
129
130 fusion'' :: HashMap ContactName (Set ContactId)
131 -> HashMap DocAuthor (Set DocId)
132 -> HashMap ContactId (Set DocId)
133 fusion'' mc md = hashmapReverse $ fusion' mc (hashmapReverse md)
134
135
136 fusion' :: HashMap ContactName (Set ContactId)
137 -> HashMap DocId (Set DocAuthor)
138 -> HashMap DocId (Set ContactId)
139 fusion' mc md = HM.fromListWith (<>)
140 $ map (\(docId, setAuthors) -> (docId, getContactIds mc $ getClosest' setAuthors (HM.keys mc)))
141 $ HM.toList md
142
143 getContactIds :: HashMap ContactName (Set ContactId) -> Set ContactName -> Set ContactId
144 getContactIds mapContactNames contactNames =
145 if Set.null contactNames
146 then Set.empty
147 else Set.unions $ catMaybes $ map (\contactName -> HM.lookup contactName mapContactNames) $ Set.toList contactNames
148
149 getClosest' :: Set DocAuthor -> [ContactName] -> Set ContactName
150 getClosest' setAuthors contactNames = trace (show (setAuthors, setContactNames)) $ setContactNames
151 where
152 setContactNames = if Set.null xs then ys else xs
153 xs = Set.fromList $ catMaybes $ map (\author -> getClosest Text.toLower author contactNames) $ Set.toList setAuthors
154 ys = Set.fromList $ catMaybes $ map (\(NgramsTerm author) -> case ((lastMay . (Text.splitOn " ")) author) of
155 Nothing -> Nothing
156 Just authorReduced -> getClosest Text.toLower (NgramsTerm authorReduced) contactNames)
157 $ Set.toList setAuthors
158
159
160 getClosest :: (Text -> Text) -> NgramsTerm -> [NgramsTerm] -> Maybe NgramsTerm
161 getClosest f (NgramsTerm from) candidates = fst <$> head scored
162 where
163 scored = List.sortOn snd
164 $ List.filter (\(_,score) -> score <= 2)
165 $ map (\cand@(NgramsTerm candidate) -> (cand, levenshtein (f from) (f candidate))) candidates
166
167
168 ------------------------------------------------------------------------
169 getNgramsContactId :: AnnuaireId
170 -> Cmd err (HashMap ContactName (Set NodeId))
171 getNgramsContactId aId = do
172 contacts <- getAllContacts aId
173 -- printDebug "getAllContexts" (tr_count contacts)
174 let paired= HM.fromListWith (<>)
175 $ map (\contact -> (toName contact, Set.singleton (contact^.node_id))
176 ) (tr_docs contacts)
177 -- printDebug "paired" (HM.keys paired)
178 pure paired
179 -- POC here, should be a probabilistic function (see the one used to find lang)
180 toName :: Node HyperdataContact -> NgramsTerm
181 -- toName contact = NgramsTerm $ (Text.toTitle $ Text.take 1 firstName) <> ". " <> (Text.toTitle lastName)
182 toName contact = NgramsTerm $ (Text.toTitle firstName) <> " " <> (Text.toTitle lastName)
183 where
184 firstName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_firstName)
185 lastName = fromMaybe "" $ contact^.(node_hyperdata . hc_who . _Just . cw_lastName)
186
187 getNgramsDocId :: CorpusId
188 -> ListId
189 -> NgramsType
190 -> GargNoServer (HashMap DocAuthor (Set NodeId))
191 getNgramsDocId cId lId nt = do
192 lIds <- selectNodesWithUsername NodeList userMaster
193 repo <- getRepo (lId:lIds)
194 let ngs = filterListWithRoot [MapTerm, CandidateTerm] $ mapTermListRoot (lId:lIds) nt repo
195 -- printDebug "getNgramsDocId" ngs
196
197 groupNodesByNgrams ngs <$> getContextsByNgramsOnlyUser cId (lIds <> [lId]) nt (HashMap.keys ngs)
198
199 hashmapReverse :: (Ord a, Eq b, Hashable b)
200 => HashMap a (Set b) -> HashMap b (Set a)
201 hashmapReverse m = HM.fromListWith (<>)
202 $ List.concat
203 $ map (\(k,vs) -> [ (v, Set.singleton k) | v <- Set.toList vs])
204 $ HM.toList m