]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'dev-ngrams-table' into dev
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Text.Terms (extractTerms)
37 import Gargantext.Database.Node.Document.Add (add)
38 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
39 import Gargantext.Database.Root (getRoot)
40 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
41 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
42 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Schema.User (getUser, UserLight(..))
45 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
46 import Gargantext.Database.Utils (Cmd)
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
54
55
56 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
57 flowCorpus ff fp cName = do
58 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
59 params <- flowInsert NodeCorpus hyperdataDocuments' cName
60 flowCorpus' NodeCorpus hyperdataDocuments' params
61
62
63 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
64 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
65 flowInsert _nt hyperdataDocuments cName = do
66 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
67
68 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
69 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
70
71 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
72 _ <- add userCorpusId (map reId ids)
73
74 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
75
76
77 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
78 flowAnnuaire filePath = do
79 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
80 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
81 printDebug "length annuaire" ps
82
83
84 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
85 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
86 flowInsertAnnuaire name children = do
87
88 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
89 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
90
91 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
92 _ <- add userCorpusId (map reId ids)
93
94 printDebug "AnnuaireID" userCorpusId
95
96 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
97
98
99 flowCorpus' :: HasNodeError err
100 => NodeType -> [HyperdataDocument]
101 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
102 -> Cmd err CorpusId
103 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
104 --------------------------------------------------
105 -- List Ngrams Flow
106 userListId <- flowListUser userId userCorpusId
107 printDebug "Working on User ListId : " userListId
108
109 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
110 -- printDebug "documentsWithId" documentsWithId
111 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
112 -- printDebug "docsWithNgrams" docsWithNgrams
113 let maps = mapNodeIdNgrams docsWithNgrams
114
115 -- printDebug "maps" (maps)
116 terms2id <- insertNgrams $ DM.keys maps
117 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
118 -- printDebug "inserted ngrams" indexedNgrams
119 _ <- insertToNodeNgrams indexedNgrams
120
121 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
122 printDebug "Working on ListId : " listId2
123 --}
124 --------------------------------------------------
125 _ <- mkDashboard userCorpusId userId
126 _ <- mkGraph userCorpusId userId
127
128 -- Annuaire Flow
129 -- _ <- mkAnnuaire rootUserId userId
130
131 pure userCorpusId
132 -- del [corpusId2, corpusId]
133
134 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
135 flowCorpus' _ _ _ = undefined
136
137
138 type CorpusName = Text
139
140 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
141 subFlowCorpus username cName = do
142 maybeUserId <- getUser username
143
144 userId <- case maybeUserId of
145 Nothing -> nodeError NoUserFound
146 -- mk NodeUser gargantua_id "Node Gargantua"
147 Just user -> pure $ userLight_id user
148
149 rootId' <- map _node_id <$> getRoot username
150
151 rootId'' <- case rootId' of
152 [] -> mkRoot username userId
153 n -> case length n >= 2 of
154 True -> nodeError ManyNodeUsers
155 False -> pure rootId'
156 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
157
158 corpusId'' <- if username == userMaster
159 then do
160 ns <- getCorporaWithParentId rootId
161 pure $ map _node_id ns
162 else
163 pure []
164
165 corpusId' <- if corpusId'' /= []
166 then pure corpusId''
167 else mkCorpus (Just cName) Nothing rootId userId
168
169 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
170
171 printDebug "(username, userId, rootId, corpusId)"
172 (username, userId, rootId, corpusId)
173 pure (userId, rootId, corpusId)
174
175
176 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
177 subFlowAnnuaire username _cName = do
178 maybeUserId <- getUser username
179
180 userId <- case maybeUserId of
181 Nothing -> nodeError NoUserFound
182 -- mk NodeUser gargantua_id "Node Gargantua"
183 Just user -> pure $ userLight_id user
184
185 rootId' <- map _node_id <$> getRoot username
186
187 rootId'' <- case rootId' of
188 [] -> mkRoot username userId
189 n -> case length n >= 2 of
190 True -> nodeError ManyNodeUsers
191 False -> pure rootId'
192 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
193
194 corpusId' <- mkAnnuaire rootId userId
195
196 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
197
198 printDebug "(username, userId, rootId, corpusId)"
199 (username, userId, rootId, corpusId)
200 pure (userId, rootId, corpusId)
201
202 ------------------------------------------------------------------------
203 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
204 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
205 where
206 err = "Database.Flow.toInsert"
207
208 toInserted :: [ReturnId] -> Map HashId ReturnId
209 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
210 . filter (\r -> reInserted r == True)
211
212 data DocumentWithId =
213 DocumentWithId { documentId :: !NodeId
214 , documentData :: !HyperdataDocument
215 } deriving (Show)
216
217 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
218 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
219 where
220 toDocumentWithId (hash,hpd) =
221 DocumentWithId <$> fmap reId (lookup hash rs)
222 <*> Just hpd
223
224 ------------------------------------------------------------------------
225 data DocumentIdWithNgrams =
226 DocumentIdWithNgrams
227 { documentWithId :: !DocumentWithId
228 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
229 } deriving (Show)
230
231 -- TODO group terms
232 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
233 extractNgramsT doc = do
234 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
235 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
236 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
237 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
238 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
239
240 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
241 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
242 <> [(a', DM.singleton Authors 1) | a' <- authors ]
243 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
244
245
246
247 documentIdWithNgrams :: HasNodeError err
248 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
249 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
250 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
251 where
252 toDocumentIdWithNgrams d = do
253 e <- f $ documentData d
254 pure $ DocumentIdWithNgrams d e
255
256 -- | TODO check optimization
257 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
258 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
259 where
260 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
261 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
262 where
263 nId = documentId $ documentWithId d
264
265 ------------------------------------------------------------------------
266 flowList :: HasNodeError err => UserId -> CorpusId
267 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
268 flowList uId cId ngs = do
269 -- printDebug "ngs:" ngs
270 lId <- getOrMkList cId uId
271 --printDebug "ngs" (DM.keys ngs)
272 -- TODO add stemming equivalence of 2 ngrams
273 -- TODO needs rework
274 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
275 -- _ <- insertGroups lId groupEd
276
277 -- compute Candidate / Map
278 is <- insertLists lId $ ngrams2list ngs
279 printDebug "listNgrams inserted :" is
280
281 pure lId
282
283 flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err NodeId
284 flowListUser uId cId = getOrMkList cId uId
285
286 ------------------------------------------------------------------------
287
288 {-
289 TODO rework:
290 * quadratic
291 * DM.keys called twice
292 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
293 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
294 -> Map NgramsIndexed NgramsIndexed
295 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
296 -}
297
298
299 -- TODO check: do not insert duplicates
300 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
301 insertGroups lId ngrs =
302 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
303 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
304 , ng1 /= ng2
305 ]
306
307 ------------------------------------------------------------------------
308 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
309 -> [(ListType, (NgramsType,NgramsIndexed))]
310 ngrams2list m =
311 [ (GraphList, (t, ng))
312 | (ng, tm) <- DM.toList m
313 , t <- DM.keys tm
314 ]
315
316 -- | TODO: weight of the list could be a probability
317 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
318 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
319 | (l,(ngt, ng)) <- lngs
320 ]
321 ------------------------------------------------------------------------