src/Gargantext/Database/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 Map (NgramsId, NodeId) -> insert
  12 data NgramsType = Sources | Authors | Terms
  13 nodes_ngrams : column type, column list
  14
  15 documents
  16 sources
  17 authors
  18
  19 -}
  20
  21 {-# LANGUAGE DeriveGeneric     #-}
  22 {-# LANGUAGE NoImplicitPrelude #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Database.Flow (flowDatabase)
  26     where
  27
  28 import GHC.Show (Show)
  29 import System.FilePath (FilePath)
  30 import Data.Maybe (Maybe(..), catMaybes)
  31 import Data.Text (Text, splitOn)
  32 import Data.Map (Map)
  33 import Data.Tuple.Extra (both, second)
  34 import qualified Data.Map as DM
  35
  36 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
  37 import Gargantext.Database.Bashql (runCmd')--, del)
  38 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
  39 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
  40 import Gargantext.Database.Node.Document.Add    (add)
  41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
  42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
  43 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
  44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  45 import Gargantext.Database.User (getUser, UserLight(..), Username)
  46 import Gargantext.Prelude
  47 import Gargantext.Text.Parsers (parseDocs, FileFormat)
  48 import Gargantext.Ext.IMT (toSchoolName)
  49
  50 type UserId = Int
  51 type RootId = Int
  52 type CorpusId = Int
  53
  54 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO [Int]
  55 flowDatabase ff fp cName = do
  56
  57   -- Corus Flow
  58   (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
  59
  60   -- Documents Flow
  61   hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
  62
  63   --printDebug "hyperdataDocuments" hyperdataDocuments
  64
  65   ids  <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  66   --printDebug "Docs IDs : " (ids)
  67   idsRepeat  <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  68   --printDebug "Repeated Docs IDs : " (length ids)
  69
  70   -- Ngrams Flow
  71   -- todo: flow for new documents only
  72   let tids = toInserted ids
  73   --printDebug "toInserted ids" (length tids, tids)
  74
  75   let tihs = toInsert hyperdataDocuments
  76   --printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
  77
  78   let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
  79   -- printDebug "documentsWithId" documentsWithId
  80
  81   let docsWithNgrams  = documentIdWithNgrams extractNgramsT documentsWithId
  82   printDebug "docsWithNgrams" docsWithNgrams
  83   {-
  84
  85   let maps            = mapNodeIdNgrams docsWithNgrams
  86   printDebug "maps" (maps)
  87
  88   indexedNgrams <- runCmd' $ indexNgrams maps
  89   printDebug "inserted ngrams" indexedNgrams
  90   _             <- runCmd' $ insertToNodeNgrams indexedNgrams
  91
  92   -- List Flow
  93   listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
  94   printDebug "list id : " listId2
  95
  96   printDebug "Docs IDs : " (length idsRepeat)
  97
  98   -}
  99   (_, _, corpusId2) <- subFlow "user1" cName
 100   {-
 101   inserted <- runCmd' $ add corpusId2 (map reId ids)
 102   printDebug "Inserted : " (length inserted)
 103   -}
 104   pure [corpusId2, corpusId]
 105
 106   --runCmd' $ del [corpusId2, corpusId]
 107
 108 type CorpusName = Text
 109
 110 subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
 111 subFlow username cName = do
 112   maybeUserId <- runCmd' (getUser username)
 113
 114   let userId = case maybeUserId of
 115         Nothing   -> panic "Error: User does not exist (yet)"
 116         -- mk NodeUser gargantua_id "Node Gargantua"
 117         Just user -> userLight_id user
 118
 119   rootId' <- map _node_id <$> runCmd' (getRoot userId)
 120
 121   rootId'' <- case rootId' of
 122         []  -> runCmd' (mkRoot username userId)
 123         n   -> case length n >= 2 of
 124             True  -> panic "Error: more than 1 userNode / user"
 125             False -> pure rootId'
 126   let rootId = maybe (panic "error rootId") identity (head rootId'')
 127
 128   corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
 129   let corpusId = maybe (panic "error corpusId") identity (head corpusId')
 130
 131   printDebug "(username, userId, rootId, corpusId)"
 132               (username, userId, rootId, corpusId)
 133   pure (userId, rootId, corpusId)
 134
 135 ------------------------------------------------------------------------
 136
 137 type HashId   = Text
 138 type NodeId   = Int
 139 type ListId   = Int
 140
 141 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
 142 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
 143   where
 144     hash = maybe "Error" identity
 145
 146 toInserted :: [ReturnId] -> Map HashId ReturnId
 147 toInserted rs = DM.fromList $ map    (\r ->  (reUniqId r, r)    )
 148                             $ filter (\r -> reInserted r == True) rs
 149
 150 data DocumentWithId =
 151      DocumentWithId { documentId   :: NodeId
 152                     , documentData :: HyperdataDocument
 153                     } deriving (Show)
 154
 155 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
 156 mergeData rs hs = map (\(hash,hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
 157                   where
 158                     lookup' h xs = maybe (panic $ "Database.Flow.mergeData: Error with " <> h) reId (DM.lookup h rs)
 159
 160 ------------------------------------------------------------------------
 161
 162 data DocumentIdWithNgrams =
 163      DocumentIdWithNgrams
 164      { documentWithId  :: DocumentWithId
 165      , document_ngrams :: Map (NgramsT Ngrams) Int
 166      } deriving (Show)
 167
 168 -- TODO add Terms (Title + Abstract)
 169 -- add f :: Text -> Text
 170 -- newtype Ngrams = Ngrams Text
 171 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
 172 extractNgramsT doc = DM.fromList $  [(NgramsT Sources source, 1)]
 173                                  <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
 174                                  <> [(NgramsT Authors    a' , 1)| a' <- authors    ]
 175   where
 176     source    = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
 177     institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))  $ _hyperdataDocument_institutes doc
 178     authors    = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
 179     -- TODO group terms
 180
 181 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
 182                      -> [DocumentWithId]   -> [DocumentIdWithNgrams]
 183 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
 184
 185 -- | TODO check optimization
 186 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
 187 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
 188   where
 189     xs  = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
 190     n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
 191
 192 indexNgrams :: Map (NgramsT Ngrams       ) (Map NodeId Int)
 193        -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
 194 indexNgrams ng2nId = do
 195   terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
 196   pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
 197
 198
 199 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
 200 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId  ((_ngramsId    . _ngramsT   ) ng)
 201                                                 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
 202                                           | (ng, nId2int) <- DM.toList m
 203                                           , (nId, n)      <- DM.toList nId2int
 204                                         ]
 205
 206
 207 ------------------------------------------------------------------------
 208 ------------------------------------------------------------------------
 209 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
 210 listFlow uId cId ng = do
 211   lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
 212   -- TODO add stemming equivalence of 2 ngrams
 213   let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ng
 214   _ <- insertGroups lId groupEd
 215
 216 -- compute Candidate / Map
 217   let lists = ngrams2list ng
 218   _ <- insertLists lId lists
 219
 220   pure lId
 221
 222 ------------------------------------------------------------------------
 223
 224 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
 225               -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
 226               -> Map NgramsIndexed NgramsIndexed
 227 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
 228
 229
 230
 231 -- TODO check: do not insert duplicates
 232 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
 233 insertGroups lId ngrs =
 234   insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
 235                               | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
 236                             ]
 237
 238 ------------------------------------------------------------------------
 239 -- TODO: verify NgramsT lost here
 240 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed
 241 ngrams2list = DM.fromList . zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
 242
 243 -- | TODO: weight of the list could be a probability
 244 insertLists :: ListId -> Map ListType NgramsIndexed -> Cmd Int
 245 insertLists lId list2ngrams =
 246   insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
 247                      | (l,ngr) <- map (second _ngramsId)   $ DM.toList list2ngrams
 248                    ]
 249
 250 ------------------------------------------------------------------------
 251 ------------------------------------------------------------------------
 252