2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 --import Control.Lens (view)
21 import System.FilePath (FilePath)
22 import Data.Maybe (Maybe(..), catMaybes)
23 import Data.Text (Text, splitOn)
24 import Data.Map (Map, lookup)
25 import Data.Tuple.Extra (both, second)
26 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
32 import Gargantext.Database.Root (getRootCmd)
33 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
34 import Gargantext.Database.Node.Document.Add (add)
35 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
36 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
37 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
38 import Gargantext.Database.Types.Node (HyperdataDocument(..))
39 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
40 import Gargantext.Database.User (getUser, UserLight(..))
41 import Gargantext.Core.Types.Individu (Username)
42 import Gargantext.Ext.IMT (toSchoolName)
43 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
44 import Gargantext.Prelude
45 import Gargantext.Text.Parsers (parseDocs, FileFormat)
46 import Gargantext.Core.Types.Main
47 --import Gargantext.Core.Types
48 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
50 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
51 flowCorpus ff fp cName = do
52 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
53 params <- flowInsert NodeCorpus hyperdataDocuments' cName
54 flowCorpus' NodeCorpus hyperdataDocuments' params
57 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
58 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
59 flowInsert _nt hyperdataDocuments cName = do
60 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
62 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
63 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
65 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
66 _ <- runCmd' $ add userCorpusId (map reId ids)
68 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
71 flowAnnuaire :: FilePath -> IO ()
72 flowAnnuaire filePath = do
73 contacts <- deserialiseImtUsersFromFile filePath
74 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
75 printDebug "length annuaire" ps
78 flowInsertAnnuaire :: CorpusName -> [ToDbData]
79 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
80 flowInsertAnnuaire name children = do
82 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
83 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
85 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
86 _ <- runCmd' $ add userCorpusId (map reId ids)
88 printDebug "AnnuaireID" userCorpusId
90 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
93 flowCorpus' :: NodeType -> [HyperdataDocument]
94 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
96 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
97 --------------------------------------------------
99 userListId <- runCmd' $ flowListUser userId userCorpusId
100 printDebug "Working on User ListId : " userListId
102 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
103 -- printDebug "documentsWithId" documentsWithId
104 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
105 -- printDebug "docsWithNgrams" docsWithNgrams
106 let maps = mapNodeIdNgrams docsWithNgrams
108 -- printDebug "maps" (maps)
109 indexedNgrams <- runCmd' $ indexNgrams maps
110 -- printDebug "inserted ngrams" indexedNgrams
111 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
113 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
114 printDebug "Working on ListId : " listId2
116 --------------------------------------------------
117 _ <- runCmd' $ mkDashboard userCorpusId userId
118 _ <- runCmd' $ mkGraph userCorpusId userId
121 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
124 -- runCmd' $ del [corpusId2, corpusId]
126 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
127 flowCorpus' _ _ _ = undefined
130 type CorpusName = Text
132 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
133 subFlowCorpus username cName = do
134 maybeUserId <- runCmd' (getUser username)
136 let userId = case maybeUserId of
137 Nothing -> panic "Error: User does not exist (yet)"
138 -- mk NodeUser gargantua_id "Node Gargantua"
139 Just user -> userLight_id user
141 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
143 rootId'' <- case rootId' of
144 [] -> runCmd' (mkRoot username userId)
145 n -> case length n >= 2 of
146 True -> panic "Error: more than 1 userNode / user"
147 False -> pure rootId'
148 let rootId = maybe (panic "error rootId") identity (head rootId'')
150 corpusId'' <- if username == userMaster
152 ns <- runCmd' $ getCorporaWithParentId' rootId
153 pure $ map _node_id ns
158 corpusId' <- if corpusId'' /= []
160 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
162 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
164 printDebug "(username, userId, rootId, corpusId)"
165 (username, userId, rootId, corpusId)
166 pure (userId, rootId, corpusId)
169 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
170 subFlowAnnuaire username _cName = do
171 maybeUserId <- runCmd' (getUser username)
173 let userId = case maybeUserId of
174 Nothing -> panic "Error: User does not exist (yet)"
175 -- mk NodeUser gargantua_id "Node Gargantua"
176 Just user -> userLight_id user
178 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
180 rootId'' <- case rootId' of
181 [] -> runCmd' (mkRoot username userId)
182 n -> case length n >= 2 of
183 True -> panic "Error: more than 1 userNode / user"
184 False -> pure rootId'
185 let rootId = maybe (panic "error rootId") identity (head rootId'')
187 corpusId' <- runCmd' $ mkAnnuaire rootId userId
189 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
191 printDebug "(username, userId, rootId, corpusId)"
192 (username, userId, rootId, corpusId)
193 pure (userId, rootId, corpusId)
197 ------------------------------------------------------------------------
198 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
199 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
201 err = "Database.Flow.toInsert"
203 toInserted :: [ReturnId] -> Map HashId ReturnId
204 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
205 . filter (\r -> reInserted r == True)
207 data DocumentWithId =
208 DocumentWithId { documentId :: NodeId
209 , documentData :: HyperdataDocument
212 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
213 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
215 toDocumentWithId (hash,hpd) =
216 DocumentWithId <$> fmap reId (lookup hash rs)
219 ------------------------------------------------------------------------
221 data DocumentIdWithNgrams =
223 { documentWithId :: DocumentWithId
224 , document_ngrams :: Map (NgramsT Ngrams) Int
227 -- TODO add Terms (Title + Abstract)
228 -- add f :: Text -> Text
229 -- newtype Ngrams = Ngrams Text
230 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
231 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
232 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
233 <> [(NgramsT Authors a' , 1)| a' <- authors ]
235 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
236 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
237 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
243 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
244 -> [DocumentWithId] -> [DocumentIdWithNgrams]
245 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
247 -- | TODO check optimization
248 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
249 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
251 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
252 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
254 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
255 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
256 indexNgrams ng2nId = do
257 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
258 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
261 ------------------------------------------------------------------------
262 ------------------------------------------------------------------------
263 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
264 flowList uId cId ngs = do
265 -- printDebug "ngs:" ngs
266 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
267 --printDebug "ngs" (DM.keys ngs)
268 -- TODO add stemming equivalence of 2 ngrams
269 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
270 _ <- insertGroups lId groupEd
272 -- compute Candidate / Map
273 let lists = ngrams2list ngs
274 -- printDebug "lists:" lists
276 is <- insertLists lId lists
277 printDebug "listNgrams inserted :" is
281 flowListUser :: UserId -> CorpusId -> Cmd [Int]
282 flowListUser uId cId = mkList cId uId
284 ------------------------------------------------------------------------
286 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
287 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
288 -> Map NgramsIndexed NgramsIndexed
289 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
293 -- TODO check: do not insert duplicates
294 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
295 insertGroups lId ngrs =
296 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
297 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
301 ------------------------------------------------------------------------
302 -- TODO: verify NgramsT lost here
303 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
304 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
306 -- | TODO: weight of the list could be a probability
307 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
308 insertLists lId lngs =
309 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
310 | (l,ngr) <- map (second _ngramsId) lngs
313 ------------------------------------------------------------------------
314 ------------------------------------------------------------------------