2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
27 import Data.Text (Text, splitOn, intercalate)
28 import Data.Tuple.Extra (both)
29 import Data.List (concat)
30 import GHC.Show (Show)
31 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
32 import Gargantext.Core.Types.Individu (Username)
33 import Gargantext.Core.Types.Main
34 import Gargantext.Core (Lang(..))
35 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
36 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
37 import Gargantext.Database.Metrics.TFICF (getTficf)
38 import Gargantext.Text.Terms (extractTerms)
39 import Gargantext.Text.Metrics.TFICF (Tficf(..))
40 import Gargantext.Database.Node.Document.Add (add)
41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
42 import Gargantext.Database.Root (getRoot)
43 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
44 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
45 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
46 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
47 import Gargantext.Database.Schema.User (getUser, UserLight(..))
48 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
49 import Gargantext.Database.Utils (Cmd)
50 import Gargantext.Text.Terms (TermType(..))
51 import Gargantext.Ext.IMT (toSchoolName)
52 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
53 import Gargantext.Prelude
54 import Gargantext.Text.Parsers (parseDocs, FileFormat)
55 import System.FilePath (FilePath)
57 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
59 import qualified Data.Map as DM
62 flowCorpus :: RepoCmdM env err m => FileFormat -> FilePath -> CorpusName -> m CorpusId
63 flowCorpus ff fp cName = do
64 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
65 params <- flowInsert NodeCorpus hyperdataDocuments' cName
66 flowCorpus' NodeCorpus hyperdataDocuments' params
69 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
70 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
71 flowInsert _nt hyperdataDocuments cName = do
72 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
74 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
75 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
77 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
78 _ <- add userCorpusId (map reId ids)
80 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
83 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
84 flowAnnuaire filePath = do
85 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
86 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
87 printDebug "length annuaire" ps
90 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
91 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
92 flowInsertAnnuaire name children = do
94 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
95 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
97 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
98 _ <- add userCorpusId (map reId ids)
100 --printDebug "AnnuaireID" userCorpusId
102 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
105 -- check userId CanFillUserCorpus userCorpusId
106 -- check masterUserId CanFillMasterCorpus masterCorpusId
110 -- InsertedNodeNgrams ?
111 flowCorpus' :: RepoCmdM env err m
112 => NodeType -> [HyperdataDocument]
113 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
115 flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
116 --------------------------------------------------
118 _userListId <- flowListUser userId userCorpusId 500
119 --printDebug "Working on User ListId : " userListId
121 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
122 -- printDebug "documentsWithId" documentsWithId
123 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
124 -- printDebug "docsWithNgrams" docsWithNgrams
125 let maps = mapNodeIdNgrams docsWithNgrams
127 -- printDebug "maps" (maps)
128 terms2id <- insertNgrams $ DM.keys maps
129 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
130 -- printDebug "inserted ngrams" indexedNgrams
131 _ <- insertToNodeNgrams indexedNgrams
133 --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
134 --printDebug "Working on ListId : " listId2
136 --------------------------------------------------
137 _ <- mkDashboard userCorpusId userId
138 _ <- mkGraph userCorpusId userId
141 -- _ <- mkAnnuaire rootUserId userId
144 -- del [corpusId2, corpusId]
146 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
147 flowCorpus' _ _ _ = undefined
150 type CorpusName = Text
152 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
153 subFlowCorpus username cName = do
154 maybeUserId <- getUser username
156 userId <- case maybeUserId of
157 Nothing -> nodeError NoUserFound
158 -- mk NodeUser gargantua_id "Node Gargantua"
159 Just user -> pure $ userLight_id user
161 rootId' <- map _node_id <$> getRoot username
163 rootId'' <- case rootId' of
164 [] -> mkRoot username userId
165 n -> case length n >= 2 of
166 True -> nodeError ManyNodeUsers
167 False -> pure rootId'
168 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
170 corpusId'' <- if username == userMaster
172 ns <- getCorporaWithParentId rootId
173 pure $ map _node_id ns
177 corpusId' <- if corpusId'' /= []
179 else mkCorpus (Just cName) Nothing rootId userId
181 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
183 --printDebug "(username, userId, rootId, corpusId)"
184 -- (username, userId, rootId, corpusId)
185 pure (userId, rootId, corpusId)
188 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
189 subFlowAnnuaire username _cName = do
190 maybeUserId <- getUser username
192 userId <- case maybeUserId of
193 Nothing -> nodeError NoUserFound
194 -- mk NodeUser gargantua_id "Node Gargantua"
195 Just user -> pure $ userLight_id user
197 rootId' <- map _node_id <$> getRoot username
199 rootId'' <- case rootId' of
200 [] -> mkRoot username userId
201 n -> case length n >= 2 of
202 True -> nodeError ManyNodeUsers
203 False -> pure rootId'
204 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
206 corpusId' <- mkAnnuaire rootId userId
208 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
210 --printDebug "(username, userId, rootId, corpusId)"
211 -- (username, userId, rootId, corpusId)
212 pure (userId, rootId, corpusId)
214 ------------------------------------------------------------------------
215 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
216 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
218 err = "Database.Flow.toInsert"
220 toInserted :: [ReturnId] -> Map HashId ReturnId
221 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
222 . filter (\r -> reInserted r == True)
224 data DocumentWithId =
225 DocumentWithId { documentId :: !NodeId
226 , documentData :: !HyperdataDocument
229 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
230 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
232 toDocumentWithId (hash,hpd) =
233 DocumentWithId <$> fmap reId (lookup hash rs)
236 ------------------------------------------------------------------------
237 data DocumentIdWithNgrams =
239 { documentWithId :: !DocumentWithId
240 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
244 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
245 extractNgramsT doc = do
246 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
247 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
248 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
249 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
250 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
252 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
253 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
254 <> [(a', DM.singleton Authors 1) | a' <- authors ]
255 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
259 documentIdWithNgrams :: HasNodeError err
260 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
261 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
262 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
264 toDocumentIdWithNgrams d = do
265 e <- f $ documentData d
266 pure $ DocumentIdWithNgrams d e
268 -- | TODO check optimization
269 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
270 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
272 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
273 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
275 nId = documentId $ documentWithId d
277 ------------------------------------------------------------------------
278 flowList :: HasNodeError err => UserId -> CorpusId
279 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
280 flowList uId cId _ngs = do
281 -- printDebug "ngs:" ngs
282 lId <- getOrMkList cId uId
283 --printDebug "ngs" (DM.keys ngs)
284 -- TODO add stemming equivalence of 2 ngrams
286 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
287 -- _ <- insertGroups lId groupEd
289 -- compute Candidate / Map
290 --is <- insertLists lId $ ngrams2list ngs
291 --printDebug "listNgrams inserted :" is
295 flowListUser :: RepoCmdM env err m
296 => UserId -> CorpusId -> Int -> m NodeId
297 flowListUser uId cId n = do
298 lId <- getOrMkList cId uId
299 -- is <- insertLists lId $ ngrams2list ngs
301 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
302 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
304 insertNewListOfNgramsElements lId $
305 DM.singleton NgramsTerms
306 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
311 ------------------------------------------------------------------------
316 * DM.keys called twice
317 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
318 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
319 -> Map NgramsIndexed NgramsIndexed
320 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
324 -- TODO check: do not insert duplicates
325 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
326 insertGroups lId ngrs =
327 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
328 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
332 ------------------------------------------------------------------------
333 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
334 -> [(ListType, (NgramsType,NgramsIndexed))]
336 [ (CandidateList, (t, ng))
337 | (ng, tm) <- DM.toList m
341 -- | TODO: weight of the list could be a probability
342 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
343 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
344 | (l,(ngt, ng)) <- lngs
346 ------------------------------------------------------------------------