]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[NGRAMS][LIST] getOrMk List.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 --import Control.Lens (view)
20 --import Gargantext.Core.Types
21 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
22 import Data.Map (Map, lookup)
23 import Data.Maybe (Maybe(..), catMaybes)
24 import Data.Text (Text, splitOn)
25 import Data.Tuple.Extra (both, second)
26 import GHC.Show (Show)
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Core.Types.Individu (Username)
29 import Gargantext.Core.Types.Main
30 import Gargantext.Database.Bashql (runCmd') -- , del)
31 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
32 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.Root (getRootCmd)
36 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
37 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
38 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
39 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
40 import Gargantext.Database.Schema.User (getUser, UserLight(..))
41 import Gargantext.Database.Types.Node (HyperdataDocument(..))
42 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
43 import Gargantext.Database.Utils (Cmd(..))
44 import Gargantext.Ext.IMT (toSchoolName)
45 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
46 import Gargantext.Prelude
47 import Gargantext.Text.Parsers (parseDocs, FileFormat)
48 import System.FilePath (FilePath)
49 import qualified Data.Map as DM
50
51 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowCorpus ff fp cName = do
53 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
54 params <- flowInsert NodeCorpus hyperdataDocuments' cName
55 flowCorpus' NodeCorpus hyperdataDocuments' params
56
57
58 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
59 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
60 flowInsert _nt hyperdataDocuments cName = do
61 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
62
63 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
64 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
65
66 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
67 _ <- runCmd' $ add userCorpusId (map reId ids)
68
69 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
70
71
72 flowAnnuaire :: FilePath -> IO ()
73 flowAnnuaire filePath = do
74 contacts <- deserialiseImtUsersFromFile filePath
75 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
76 printDebug "length annuaire" ps
77
78
79 flowInsertAnnuaire :: CorpusName -> [ToDbData]
80 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
81 flowInsertAnnuaire name children = do
82
83 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
84 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
85
86 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
87 _ <- runCmd' $ add userCorpusId (map reId ids)
88
89 printDebug "AnnuaireID" userCorpusId
90
91 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
92
93
94 flowCorpus' :: NodeType -> [HyperdataDocument]
95 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
96 -> IO CorpusId
97 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
98 --------------------------------------------------
99 -- List Ngrams Flow
100 userListId <- runCmd' $ flowListUser userId userCorpusId
101 printDebug "Working on User ListId : " userListId
102
103 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
104 -- printDebug "documentsWithId" documentsWithId
105 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
106 -- printDebug "docsWithNgrams" docsWithNgrams
107 let maps = mapNodeIdNgrams docsWithNgrams
108
109 -- printDebug "maps" (maps)
110 indexedNgrams <- runCmd' $ indexNgrams maps
111 -- printDebug "inserted ngrams" indexedNgrams
112 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
113
114 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
115 printDebug "Working on ListId : " listId2
116 --}
117 --------------------------------------------------
118 _ <- runCmd' $ mkDashboard userCorpusId userId
119 _ <- runCmd' $ mkGraph userCorpusId userId
120
121 -- Annuaire Flow
122 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
123
124 pure userCorpusId
125 -- runCmd' $ del [corpusId2, corpusId]
126
127 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
128 flowCorpus' _ _ _ = undefined
129
130
131 type CorpusName = Text
132
133 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
134 subFlowCorpus username cName = do
135 maybeUserId <- runCmd' (getUser username)
136
137 let userId = case maybeUserId of
138 Nothing -> panic "Error: User does not exist (yet)"
139 -- mk NodeUser gargantua_id "Node Gargantua"
140 Just user -> userLight_id user
141
142 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
143
144 rootId'' <- case rootId' of
145 [] -> runCmd' (mkRoot username userId)
146 n -> case length n >= 2 of
147 True -> panic "Error: more than 1 userNode / user"
148 False -> pure rootId'
149 let rootId = maybe (panic "error rootId") identity (head rootId'')
150 --{-
151 corpusId'' <- if username == userMaster
152 then do
153 ns <- runCmd' $ getCorporaWithParentId' rootId
154 pure $ map _node_id ns
155 else
156 pure []
157
158 --}
159 corpusId' <- if corpusId'' /= []
160 then pure corpusId''
161 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
162
163 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
164
165 printDebug "(username, userId, rootId, corpusId)"
166 (username, userId, rootId, corpusId)
167 pure (userId, rootId, corpusId)
168
169
170 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
171 subFlowAnnuaire username _cName = do
172 maybeUserId <- runCmd' (getUser username)
173
174 let userId = case maybeUserId of
175 Nothing -> panic "Error: User does not exist (yet)"
176 -- mk NodeUser gargantua_id "Node Gargantua"
177 Just user -> userLight_id user
178
179 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
180
181 rootId'' <- case rootId' of
182 [] -> runCmd' (mkRoot username userId)
183 n -> case length n >= 2 of
184 True -> panic "Error: more than 1 userNode / user"
185 False -> pure rootId'
186 let rootId = maybe (panic "error rootId") identity (head rootId'')
187
188 corpusId' <- runCmd' $ mkAnnuaire rootId userId
189
190 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
191
192 printDebug "(username, userId, rootId, corpusId)"
193 (username, userId, rootId, corpusId)
194 pure (userId, rootId, corpusId)
195
196
197
198 ------------------------------------------------------------------------
199 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
200 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
201 where
202 err = "Database.Flow.toInsert"
203
204 toInserted :: [ReturnId] -> Map HashId ReturnId
205 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
206 . filter (\r -> reInserted r == True)
207
208 data DocumentWithId =
209 DocumentWithId { documentId :: NodeId
210 , documentData :: HyperdataDocument
211 } deriving (Show)
212
213 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
214 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
215 where
216 toDocumentWithId (hash,hpd) =
217 DocumentWithId <$> fmap reId (lookup hash rs)
218 <*> Just hpd
219
220 ------------------------------------------------------------------------
221
222 data DocumentIdWithNgrams =
223 DocumentIdWithNgrams
224 { documentWithId :: DocumentWithId
225 , document_ngrams :: Map (NgramsT Ngrams) Int
226 } deriving (Show)
227
228 -- TODO add Terms (Title + Abstract)
229 -- add f :: Text -> Text
230 -- newtype Ngrams = Ngrams Text
231 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
232 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
233 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
234 <> [(NgramsT Authors a' , 1)| a' <- authors ]
235 where
236 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
239 -- TODO group terms
240
241
242
243
244 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
245 -> [DocumentWithId] -> [DocumentIdWithNgrams]
246 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
247
248 -- | TODO check optimization
249 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
250 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
251 where
252 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
253 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
254
255 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
256 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
257 indexNgrams ng2nId = do
258 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
259 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
260
261
262 ------------------------------------------------------------------------
263 ------------------------------------------------------------------------
264 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
265 flowList uId cId ngs = do
266 -- printDebug "ngs:" ngs
267 lId <- getOrMkList cId uId
268 --printDebug "ngs" (DM.keys ngs)
269 -- TODO add stemming equivalence of 2 ngrams
270 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
271 _ <- insertGroups lId groupEd
272
273 -- compute Candidate / Map
274 let lists = ngrams2list ngs
275 -- printDebug "lists:" lists
276
277 is <- insertLists lId lists
278 printDebug "listNgrams inserted :" is
279
280 pure lId
281
282 flowListUser :: UserId -> CorpusId -> Cmd Int
283 flowListUser uId cId = getOrMkList cId uId
284
285 ------------------------------------------------------------------------
286
287 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
288 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
289 -> Map NgramsIndexed NgramsIndexed
290 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
291
292
293
294 -- TODO check: do not insert duplicates
295 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
296 insertGroups lId ngrs =
297 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
298 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
299 , ng1 /= ng2
300 ]
301
302 ------------------------------------------------------------------------
303 -- TODO: verify NgramsT lost here
304 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
305 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
306
307 -- | TODO: weight of the list could be a probability
308 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
309 insertLists lId lngs =
310 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
311 | (l,ngr) <- map (second _ngramsId) lngs
312 ]
313
314 ------------------------------------------------------------------------
315 ------------------------------------------------------------------------
316