]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[CONTACT] entry/departure time
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
26
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
32 import Gargantext.Database.Node.Document.Add (add)
33 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds, Hyper(HyperDocument))
34 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
35 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Database.Node.Contact (HyperdataContact(..))
38 import Gargantext.Database.User (getUser, UserLight(..), Username)
39 import Gargantext.Ext.IMT (toSchoolName)
40 import Gargantext.Prelude
41 import Gargantext.Text.Parsers (parseDocs, FileFormat)
42
43 type UserId = Int
44 type RootId = Int
45 type CorpusId = Int
46
47 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
48 flowDatabase ff fp cName = do
49
50 -- Corus Flow
51 (masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
52
53 -- Documents Flow
54 hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
55 let hyperdataDocuments' = map (\h -> HyperDocument h) hyperdataDocuments
56 printDebug "hyperdataDocuments" hyperdataDocuments
57
58 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
59 --printDebug "Docs IDs : " (ids)
60 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
61 printDebug "Repeated Docs IDs : " (length idsRepeat)
62
63 -- Ngrams Flow
64 -- todo: flow for new documents only
65 let tids = toInserted ids
66 printDebug "toInserted ids" (length tids)
67
68 let tihs = toInsert hyperdataDocuments
69 printDebug "toInsert hyperdataDocuments" (length tihs)
70
71 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
72 printDebug "documentsWithId" documentsWithId
73
74 -- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
75 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
76 printDebug "docsWithNgrams" docsWithNgrams
77
78 let maps = mapNodeIdNgrams docsWithNgrams
79 printDebug "maps" (maps)
80
81 indexedNgrams <- runCmd' $ indexNgrams maps
82 printDebug "inserted ngrams" indexedNgrams
83 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
84
85 -- List Flow
86 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
87 printDebug "list id : " listId2
88
89 (userId, rootUserId, corpusId2) <- subFlowCorpus userArbitrary cName
90
91 userListId <- runCmd' $ listFlowUser userId corpusId2
92 printDebug "UserList : " userListId
93
94 inserted <- runCmd' $ add corpusId2 (map reId ids)
95 printDebug "Inserted : " (length inserted)
96
97 _ <- runCmd' $ mkDashboard corpusId2 userId
98 _ <- runCmd' $ mkGraph corpusId2 userId
99
100 -- Annuaire Flow
101 annuaireId <- runCmd' $ mkAnnuaire rootUserId userId
102
103 pure corpusId2
104 -- runCmd' $ del [corpusId2, corpusId]
105
106 type CorpusName = Text
107
108 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
109 subFlowCorpus username cName = do
110 maybeUserId <- runCmd' (getUser username)
111
112 let userId = case maybeUserId of
113 Nothing -> panic "Error: User does not exist (yet)"
114 -- mk NodeUser gargantua_id "Node Gargantua"
115 Just user -> userLight_id user
116
117 rootId' <- map _node_id <$> runCmd' (getRoot userId)
118
119 rootId'' <- case rootId' of
120 [] -> runCmd' (mkRoot username userId)
121 n -> case length n >= 2 of
122 True -> panic "Error: more than 1 userNode / user"
123 False -> pure rootId'
124 let rootId = maybe (panic "error rootId") identity (head rootId'')
125
126 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
127 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
128
129 printDebug "(username, userId, rootId, corpusId)"
130 (username, userId, rootId, corpusId)
131 pure (userId, rootId, corpusId)
132
133 ------------------------------------------------------------------------
134
135 type HashId = Text
136 type NodeId = Int
137 type ListId = Int
138
139 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
140 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
141 where
142 hash = maybe "Error" identity
143
144 toInserted :: [ReturnId] -> Map HashId ReturnId
145 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
146 $ filter (\r -> reInserted r == True) rs
147
148 data DocumentWithId =
149 DocumentWithId { documentId :: NodeId
150 , documentData :: HyperdataDocument
151 } deriving (Show)
152
153 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
154 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
155 where
156 toDocumentWithId (hash,hpd) =
157 DocumentWithId <$> fmap reId (lookup hash rs)
158 <*> Just hpd
159
160 ------------------------------------------------------------------------
161
162 data DocumentIdWithNgrams =
163 DocumentIdWithNgrams
164 { documentWithId :: DocumentWithId
165 , document_ngrams :: Map (NgramsT Ngrams) Int
166 } deriving (Show)
167
168 -- TODO add Terms (Title + Abstract)
169 -- add f :: Text -> Text
170 -- newtype Ngrams = Ngrams Text
171 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
172 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
173 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
174 <> [(NgramsT Authors a' , 1)| a' <- authors ]
175 where
176 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
177 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
178 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
179 -- TODO group terms
180
181 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
182 -> [DocumentWithId] -> [DocumentIdWithNgrams]
183 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
184
185 -- | TODO check optimization
186 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
187 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
188 where
189 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
190 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
191
192 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
193 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
194 indexNgrams ng2nId = do
195 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
196 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
197
198
199 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
200 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
201 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
202 | (ng, nId2int) <- DM.toList m
203 , (nId, n) <- DM.toList nId2int
204 ]
205
206
207 ------------------------------------------------------------------------
208 ------------------------------------------------------------------------
209 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
210 listFlow uId cId ngs = do
211 printDebug "ngs:" ngs
212 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
213 printDebug "ngs" (DM.keys ngs)
214 -- TODO add stemming equivalence of 2 ngrams
215 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
216 _ <- insertGroups lId groupEd
217
218 -- compute Candidate / Map
219 let lists = ngrams2list ngs
220 printDebug "lists:" lists
221
222 is <- insertLists lId lists
223 printDebug "listNgrams inserted :" is
224
225 pure lId
226
227 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
228 listFlowUser uId cId = mkList cId uId
229
230 ------------------------------------------------------------------------
231
232 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
233 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
234 -> Map NgramsIndexed NgramsIndexed
235 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
236
237
238
239 -- TODO check: do not insert duplicates
240 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
241 insertGroups lId ngrs =
242 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
243 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
244 , ng1 /= ng2
245 ]
246
247 ------------------------------------------------------------------------
248 -- TODO: verify NgramsT lost here
249 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
250 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
251
252 -- | TODO: weight of the list could be a probability
253 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
254 insertLists lId lngs =
255 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
256 | (l,ngr) <- map (second _ngramsId) lngs
257 ]
258
259 ------------------------------------------------------------------------
260 ------------------------------------------------------------------------
261