2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Text.Terms (extractTerms)
37 import Gargantext.Database.Node.Document.Add (add)
38 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
39 import Gargantext.Database.Root (getRoot)
40 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams, ngramsTypeId)
41 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
42 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Schema.User (getUser, UserLight(..))
45 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
46 import Gargantext.Database.Utils (Cmd)
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
54 import qualified Data.Set as DS
57 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
58 flowCorpus ff fp cName = do
59 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
60 params <- flowInsert NodeCorpus hyperdataDocuments' cName
61 flowCorpus' NodeCorpus hyperdataDocuments' params
64 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
65 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
66 flowInsert _nt hyperdataDocuments cName = do
67 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
69 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
70 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
72 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
73 _ <- add userCorpusId (map reId ids)
75 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
78 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
79 flowAnnuaire filePath = do
80 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
81 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
82 printDebug "length annuaire" ps
85 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
86 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
87 flowInsertAnnuaire name children = do
89 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
90 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
92 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
93 _ <- add userCorpusId (map reId ids)
95 printDebug "AnnuaireID" userCorpusId
97 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
100 flowCorpus' :: HasNodeError err
101 => NodeType -> [HyperdataDocument]
102 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
105 --------------------------------------------------
107 userListId <- flowListUser userId userCorpusId
108 printDebug "Working on User ListId : " userListId
110 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
111 -- printDebug "documentsWithId" documentsWithId
112 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
113 -- printDebug "docsWithNgrams" docsWithNgrams
114 let maps = mapNodeIdNgrams docsWithNgrams
116 -- printDebug "maps" (maps)
117 terms2id <- insertNgrams (DS.toList $ DS.map _ngramsT (DM.keysSet maps))
118 let indexedNgrams = DM.mapKeys (indexNgramsT terms2id) maps
119 -- printDebug "inserted ngrams" indexedNgrams
120 _ <- insertToNodeNgrams indexedNgrams
122 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
123 printDebug "Working on ListId : " listId2
125 --------------------------------------------------
126 _ <- mkDashboard userCorpusId userId
127 _ <- mkGraph userCorpusId userId
130 -- _ <- mkAnnuaire rootUserId userId
133 -- del [corpusId2, corpusId]
135 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
136 flowCorpus' _ _ _ = undefined
139 type CorpusName = Text
141 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
142 subFlowCorpus username cName = do
143 maybeUserId <- getUser username
145 userId <- case maybeUserId of
146 Nothing -> nodeError NoUserFound
147 -- mk NodeUser gargantua_id "Node Gargantua"
148 Just user -> pure $ userLight_id user
150 rootId' <- map _node_id <$> getRoot username
152 rootId'' <- case rootId' of
153 [] -> mkRoot username userId
154 n -> case length n >= 2 of
155 True -> nodeError ManyNodeUsers
156 False -> pure rootId'
157 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
159 corpusId'' <- if username == userMaster
161 ns <- getCorporaWithParentId rootId
162 pure $ map _node_id ns
166 corpusId' <- if corpusId'' /= []
168 else mkCorpus (Just cName) Nothing rootId userId
170 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
172 printDebug "(username, userId, rootId, corpusId)"
173 (username, userId, rootId, corpusId)
174 pure (userId, rootId, corpusId)
177 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
178 subFlowAnnuaire username _cName = do
179 maybeUserId <- getUser username
181 userId <- case maybeUserId of
182 Nothing -> nodeError NoUserFound
183 -- mk NodeUser gargantua_id "Node Gargantua"
184 Just user -> pure $ userLight_id user
186 rootId' <- map _node_id <$> getRoot username
188 rootId'' <- case rootId' of
189 [] -> mkRoot username userId
190 n -> case length n >= 2 of
191 True -> nodeError ManyNodeUsers
192 False -> pure rootId'
193 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
195 corpusId' <- mkAnnuaire rootId userId
197 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
199 printDebug "(username, userId, rootId, corpusId)"
200 (username, userId, rootId, corpusId)
201 pure (userId, rootId, corpusId)
203 ------------------------------------------------------------------------
204 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
205 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
207 err = "Database.Flow.toInsert"
209 toInserted :: [ReturnId] -> Map HashId ReturnId
210 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
211 . filter (\r -> reInserted r == True)
213 data DocumentWithId =
214 DocumentWithId { documentId :: !NodeId
215 , documentData :: !HyperdataDocument
218 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
219 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
221 toDocumentWithId (hash,hpd) =
222 DocumentWithId <$> fmap reId (lookup hash rs)
225 ------------------------------------------------------------------------
226 data DocumentIdWithNgrams =
228 { documentWithId :: !DocumentWithId
229 , document_ngrams :: !(Map (NgramsT Ngrams) Int)
233 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int)
234 extractNgramsT doc = do
236 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
239 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
240 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
242 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
243 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
244 <> [(NgramsT Authors a' , 1)| a' <- authors ]
245 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
249 documentIdWithNgrams :: HasNodeError err => (HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int))
250 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
251 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
253 toDocumentIdWithNgrams d = do
254 e <- f $ documentData d
255 pure $ DocumentIdWithNgrams d e
257 -- | TODO check optimization
258 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
259 mapNodeIdNgrams ds = DM.fromListWith (DM.unionWith (+)) xs
261 xs = [(ng, DM.singleton nId i) | (nId, n2i') <- ds', (ng, i) <- DM.toList n2i']
262 ds' = (\d -> ((documentId . documentWithId) d, document_ngrams d)) <$> ds
264 ------------------------------------------------------------------------
265 flowList :: HasNodeError err => UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd err ListId
266 flowList uId cId ngs = do
267 -- printDebug "ngs:" ngs
268 lId <- getOrMkList cId uId
269 --printDebug "ngs" (DM.keys ngs)
270 -- TODO add stemming equivalence of 2 ngrams
271 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
272 _ <- insertGroups lId groupEd
274 -- compute Candidate / Map
275 is <- insertLists lId $ ngrams2list ngs
276 printDebug "listNgrams inserted :" is
280 flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err NodeId
281 flowListUser uId cId = getOrMkList cId uId
283 ------------------------------------------------------------------------
285 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
286 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
287 -> Map NgramsIndexed NgramsIndexed
288 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
292 -- TODO check: do not insert duplicates
293 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
294 insertGroups lId ngrs =
295 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
296 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
300 ------------------------------------------------------------------------
301 -- TODO: verify NgramsT lost here
302 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType, (NgramsType,NgramsIndexed))]
303 ngrams2list = zip (repeat GraphList) . map (\(NgramsT ngt ng) -> (ngt, ng)) . DM.keys
305 -- | TODO: weight of the list could be a probability
306 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
307 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
308 | (l,(ngt, ng)) <- lngs
310 ------------------------------------------------------------------------