2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
22 --import Control.Lens (view)
23 import Control.Monad.IO.Class (liftIO)
24 --import Gargantext.Core.Types
25 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
26 import Data.Map (Map, lookup)
27 import Data.Maybe (Maybe(..), catMaybes)
29 import Data.Text (Text, splitOn, intercalate)
30 import Data.Tuple.Extra (both)
31 import Data.List (concat)
32 import GHC.Show (Show)
33 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
34 import Gargantext.Core.Types.Individu (Username)
35 import Gargantext.Core.Types.Main
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
38 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
39 import Gargantext.Database.Metrics.TFICF (getTficf)
40 import Gargantext.Text.Terms (extractTerms)
41 import Gargantext.Text.Metrics.TFICF (Tficf(..))
42 import Gargantext.Database.Node.Document.Add (add)
43 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
46 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
47 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
48 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
49 import Gargantext.Database.Schema.User (getUser, UserLight(..))
50 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
51 import Gargantext.Database.Utils (Cmd, CmdM)
52 import Gargantext.Text.Terms (TermType(..))
53 import Gargantext.Ext.IMT (toSchoolName)
54 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
55 import Gargantext.Prelude
56 import Gargantext.Text.Parsers (parseDocs, FileFormat)
57 import System.FilePath (FilePath)
58 import Gargantext.API.Ngrams (HasRepoVar)
59 import Servant (ServantErr)
60 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
61 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
62 import qualified Data.Map as DM
64 type FlowCmdM env err m =
72 flowCorpus :: FlowCmdM env ServantErr m
73 => FileFormat -> FilePath -> CorpusName -> m CorpusId
74 flowCorpus ff fp cName = do
75 --insertUsers [gargantuaUser, simpleUser]
76 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
77 params <- flowInsert NodeCorpus hyperdataDocuments' cName
78 flowCorpus' NodeCorpus hyperdataDocuments' params
81 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
82 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
83 flowInsert _nt hyperdataDocuments cName = do
84 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
86 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
87 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
89 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
90 _ <- add userCorpusId (map reId ids)
92 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
95 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
96 flowAnnuaire filePath = do
97 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
98 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
99 printDebug "length annuaire" ps
102 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
103 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 flowInsertAnnuaire name children = do
106 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
107 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
109 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
110 _ <- add userCorpusId (map reId ids)
112 printDebug "AnnuaireID" userCorpusId
114 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
117 -- check userId CanFillUserCorpus userCorpusId
118 -- check masterUserId CanFillMasterCorpus masterCorpusId
122 -- InsertedNodeNgrams ?
123 flowCorpus' :: FlowCmdM env err m
124 => NodeType -> [HyperdataDocument]
125 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
127 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
128 --------------------------------------------------
130 userListId <- flowListUser userId userCorpusId 500
131 printDebug "Working on User ListId : " userListId
133 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
134 printDebug "documentsWithId" documentsWithId
135 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
136 printDebug "docsWithNgrams" docsWithNgrams
137 let maps = mapNodeIdNgrams docsWithNgrams
139 -- printDebug "maps" (maps)
140 terms2id <- insertNgrams $ DM.keys maps
141 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
142 printDebug "inserted ngrams" indexedNgrams
143 _ <- insertToNodeNgrams indexedNgrams
145 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
146 printDebug "Working on ListId : " listId2
148 --------------------------------------------------
149 _ <- mkDashboard userCorpusId userId
150 _ <- mkGraph userCorpusId userId
153 -- _ <- mkAnnuaire rootUserId userId
156 -- del [corpusId2, corpusId]
158 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
159 flowCorpus' _ _ _ = undefined
162 type CorpusName = Text
164 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
165 subFlowCorpus username cName = do
166 maybeUserId <- getUser username
167 userId <- case maybeUserId of
168 Nothing -> nodeError NoUserFound
169 -- mk NodeUser gargantua_id "Node Gargantua"
170 Just user -> pure $ userLight_id user
172 printDebug "userId" userId
173 rootId' <- map _node_id <$> getRoot username
175 printDebug "rootId'" rootId'
176 rootId'' <- case rootId' of
177 [] -> mkRoot username userId
178 n -> case length n >= 2 of
179 True -> nodeError ManyNodeUsers
180 False -> pure rootId'
182 printDebug "rootId''" rootId''
183 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
185 corpusId'' <- if username == userMaster
187 ns <- getCorporaWithParentId rootId
188 pure $ map _node_id ns
192 corpusId' <- if corpusId'' /= []
194 else mkCorpus (Just cName) Nothing rootId userId
196 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
198 printDebug "(username, userId, rootId, corpusId)"
199 (username, userId, rootId, corpusId)
200 pure (userId, rootId, corpusId)
203 subFlowAnnuaire :: HasNodeError err =>
204 Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
205 subFlowAnnuaire username _cName = do
206 maybeUserId <- getUser username
208 userId <- case maybeUserId of
209 Nothing -> nodeError NoUserFound
210 -- mk NodeUser gargantua_id "Node Gargantua"
211 Just user -> pure $ userLight_id user
213 rootId' <- map _node_id <$> getRoot username
215 rootId'' <- case rootId' of
216 [] -> mkRoot username userId
217 n -> case length n >= 2 of
218 True -> nodeError ManyNodeUsers
219 False -> pure rootId'
220 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
222 corpusId' <- mkAnnuaire rootId userId
224 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
226 printDebug "(username, userId, rootId, corpusId)"
227 (username, userId, rootId, corpusId)
228 pure (userId, rootId, corpusId)
230 ------------------------------------------------------------------------
231 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
232 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
234 err = "Database.Flow.toInsert"
236 toInserted :: [ReturnId] -> Map HashId ReturnId
237 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
238 . filter (\r -> reInserted r == True)
240 data DocumentWithId =
241 DocumentWithId { documentId :: !NodeId
242 , documentData :: !HyperdataDocument
245 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
246 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
248 toDocumentWithId (hash,hpd) =
249 DocumentWithId <$> fmap reId (lookup hash rs)
252 ------------------------------------------------------------------------
253 data DocumentIdWithNgrams =
255 { documentWithId :: !DocumentWithId
256 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
260 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
261 extractNgramsT doc = do
262 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
263 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
264 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
265 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
266 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
268 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
269 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
270 <> [(a', DM.singleton Authors 1) | a' <- authors ]
271 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
275 documentIdWithNgrams :: HasNodeError err
276 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
277 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
278 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
280 toDocumentIdWithNgrams d = do
281 e <- f $ documentData d
282 pure $ DocumentIdWithNgrams d e
284 -- | TODO check optimization
285 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
286 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
288 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
289 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
291 nId = documentId $ documentWithId d
293 ------------------------------------------------------------------------
294 flowList :: HasNodeError err => UserId -> CorpusId
295 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
296 flowList uId cId ngs = do
297 -- printDebug "ngs:" ngs
298 lId <- getOrMkList cId uId
299 printDebug "ngs" (DM.keys ngs)
300 -- TODO add stemming equivalence of 2 ngrams
302 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
303 -- _ <- insertGroups lId groupEd
305 -- compute Candidate / Map
306 is <- insertLists lId $ ngrams2list ngs
307 printDebug "listNgrams inserted :" is
311 flowListUser :: FlowCmdM env err m
312 => UserId -> CorpusId -> Int -> m ListId
313 flowListUser uId cId n = do
314 lId <- getOrMkList cId uId
315 -- is <- insertLists lId $ ngrams2list ngs
317 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
318 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
320 insertNewListOfNgramsElements lId NgramsTerms $
321 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
326 ------------------------------------------------------------------------
331 * DM.keys called twice
332 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
333 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
334 -> Map NgramsIndexed NgramsIndexed
335 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
339 -- TODO check: do not insert duplicates
340 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
341 insertGroups lId ngrs =
342 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
343 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
347 ------------------------------------------------------------------------
348 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
349 -> [(ListType, (NgramsType,NgramsIndexed))]
351 [ (CandidateList, (t, ng))
352 | (ng, tm) <- DM.toList m
356 -- | TODO: weight of the list could be a probability
357 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
358 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
359 | (l,(ngt, ng)) <- lngs
361 ------------------------------------------------------------------------