2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
40 , getOrMk_RootWithCorpus
46 , indexAllDocumentsWithPosTag
51 import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
52 import Data.Aeson.TH (deriveJSON)
53 import Data.Conduit.Internal (zipSources)
55 import Data.HashMap.Strict (HashMap)
56 import Data.Hashable (Hashable)
57 import Data.List (concat)
58 import Data.Map (Map, lookup)
59 import Data.Maybe (catMaybes)
62 import qualified Data.Text as T
63 import Data.Tuple.Extra (first, second)
64 import GHC.Generics (Generic)
65 import Servant.Client (ClientError)
66 import System.FilePath (FilePath)
67 import qualified Data.HashMap.Strict as HashMap
68 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
69 import qualified Data.Map as Map
70 import qualified Data.Conduit.List as CL
71 import qualified Data.Conduit as C
73 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
74 import Gargantext.Core (Lang(..), PosTagAlgo(..))
75 import Gargantext.Core.Ext.IMT (toSchoolName)
76 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
77 import Gargantext.Core.Flow.Types
78 import Gargantext.Core.Text
79 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
80 import Gargantext.Core.Text.List (buildNgramsLists)
81 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
82 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
83 import Gargantext.Core.Text.Terms
84 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
85 import Gargantext.Core.Types (POS(NP), TermsCount)
86 import Gargantext.Core.Types.Individu (User(..))
87 import Gargantext.Core.Types.Main
88 import Gargantext.Core.Utils (addTuples)
89 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
90 import Gargantext.Database.Action.Flow.List
91 import Gargantext.Database.Action.Flow.Types
92 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
93 import Gargantext.Database.Action.Search (searchDocInDatabase)
94 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
95 import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
96 import Gargantext.Database.Admin.Types.Hyperdata
97 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
98 import Gargantext.Database.Prelude
99 import Gargantext.Database.Query.Table.ContextNodeNgrams2
100 import Gargantext.Database.Query.Table.Ngrams
101 import Gargantext.Database.Query.Table.Node
102 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
103 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
104 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
105 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
106 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
107 import Gargantext.Database.Types
108 import Gargantext.Prelude
109 import Gargantext.Prelude.Crypto.Hash (Hash)
110 import qualified Gargantext.Core.Text.Corpus.API as API
111 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
112 import qualified Prelude
114 ------------------------------------------------------------------------
115 -- Imports for upgrade function
116 import Gargantext.Database.Query.Tree.Root (getRootId)
117 import Gargantext.Database.Query.Tree (findNodesId)
118 import qualified Data.List as List
119 ------------------------------------------------------------------------
120 -- TODO use internal with API name (could be old data)
-- | Provenance of a document set: 'InternalOrigin' points at an API whose
--   results are already stored in the local/master database, 'ExternalOrigin'
--   at an API that must be queried over the network.
--   NOTE(review): original lines 123/125 are missing from this extract;
--   confirm against VCS that no constructor or deriving clause is lost.
121 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
122 | ExternalOrigin { _do_api :: API.ExternalAPIs }
124 deriving (Generic, Eq)
-- Template Haskell: lenses for the record fields, plus JSON instances
-- with the "_do_" field prefix stripped.
126 makeLenses ''DataOrigin
127 deriveJSON (unPrefix "_do_") ''DataOrigin
-- | Swagger schema derived generically, stripping the "_do_" record prefix
--   so the schema field names match the JSON produced by 'deriveJSON'.
128 instance ToSchema DataOrigin where
129 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every possible 'DataOrigin': an 'InternalOrigin' and an
--   'ExternalOrigin' for each supported external API, internal ones first.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ origin api | origin <- [InternalOrigin, ExternalOrigin]
                              , api    <- API.externalAPIs ]
-- | Result of 'getDataText': either the node ids of documents already in
--   the database ('DataOld'), or an optional expected document count plus
--   a conduit producing freshly fetched documents ('DataNew').
136 data DataText = DataOld ![NodeId]
137 | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
138 --- | DataNew ![[HyperdataDocument]]
140 -- Show instance is not possible because of IO
-- | Debug helper: dump a 'DataText' to stdout.  A 'Show' instance is not
--   possible because 'DataNew' carries an 'IO' conduit, so the conduit is
--   drained into a list before printing.
printDataText :: DataText -> IO ()
printDataText (DataOld nodeIds) =
  putStrLn (show nodeIds)
printDataText (DataNew (mCount, docsC)) =
  C.runConduit (docsC .| CL.consume) >>= \docs ->
    putStrLn (show (mCount, docs))
147 -- TODO use the split parameter in config file
-- | Fetch the documents matching a query, either from an external API or
--   from the master corpus already in the database.
--   NOTE(review): signature lines 149-152 are missing from this extract.
148 getDataText :: FlowCmdM env err m
153 -> m (Either ClientError DataText)
-- External origin: query the API in IO and wrap the stream as 'DataNew'.
154 getDataText (ExternalOrigin api) la q li = liftBase $ do
155 eRes <- API.get api (_tt_lang la) q li
156 pure $ DataNew <$> eRes
-- Internal origin: search the master corpus with the stemmed query and
-- return matching ids as 'DataOld'; this path cannot fail with ClientError.
-- NOTE(review): line 161 (an argument to getOrMk_RootWithCorpus, likely the
-- corpus name) is missing from this extract.
158 getDataText (InternalOrigin _) _la q _li = do
159 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
160 (UserName userMaster)
162 (Nothing :: Maybe HyperdataCorpus)
163 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
164 pure $ Right $ DataOld ids
-- | Debug wrapper around 'getDataText': print either the client error or
--   the result instead of returning it.
--   NOTE(review): the 'case result of' scrutinee line (174) is missing
--   from this extract.
166 getDataText_Debug :: FlowCmdM env err m
172 getDataText_Debug a l q li = do
173 result <- getDataText a l q li
175 Left err -> liftBase $ putStrLn $ show err
176 Right res -> liftBase $ printDataText res
179 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the given corpus: already-stored documents are
--   wired in via 'flowCorpusUser'; new documents are inserted through
--   'flowCorpus'.  NOTE(review): several signature lines are missing from
--   this extract.
180 flowDataText :: forall env err m.
187 -> Maybe FlowSocialListWith
190 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
-- Nothing fixes the hyperdata type to the default HyperdataCorpus.
192 corpusType = (Nothing :: Maybe HyperdataCorpus)
193 flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
-- hoist the IO conduit into the command monad before flowing it
194 flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
196 ------------------------------------------------------------------------
-- | Import a whole annuaire (contacts) from a file into a corpus of
--   'HyperdataContact'.  The file is read eagerly, so the document count
--   is known up front for progress reporting.
198 flowAnnuaire :: (FlowCmdM env err m)
200 -> Either CorpusName [CorpusId]
205 flowAnnuaire u n l filePath logStatus = do
206 -- TODO Conduit for file
207 docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
208 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
210 ------------------------------------------------------------------------
-- | Parse a local corpus file and flow the parsed documents into the
--   corpus; on parse failure the whole job panics with the parser error.
--   NOTE(review): the limit parameter (_l) is currently ignored — see the
--   commented-out splitEvery code below.
211 flowCorpusFile :: (FlowCmdM env err m)
213 -> Either CorpusName [CorpusId]
214 -> Limit -- Limit the number of docs (for dev purpose)
219 -> Maybe FlowSocialListWith
222 flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
223 eParsed <- liftBase $ parseFile ft ff fp
226 flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
227 --let docs = splitEvery 500 $ take l parsed
228 --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
229 Left e -> panic $ "Error: " <> T.pack e
231 ------------------------------------------------------------------------
232 -- | TODO improve the needed type to create/update a corpus
233 -- (For now, Either is enough)
-- | Specialisation of 'flow' for corpora: the hyperdata type is fixed to
--   'HyperdataCorpus'.
234 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
236 -> Either CorpusName [CorpusId]
238 -> Maybe FlowSocialListWith
239 -> (Maybe Integer, ConduitT () a m ())
242 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic document flow: insert each document from the conduit into the
--   master corpus (logging progress per document), then wire the resulting
--   ids into the user corpus via 'flowCorpusUser'.
--   NOTE(review): several lines are missing from this extract, including
--   the rest of the conduit pipeline after 'zipSources' and the 'where'
--   header introducing 'insertDoc' — confirm against VCS.
245 flow :: forall env err m a c.
252 -> Either CorpusName [CorpusId]
254 -> Maybe FlowSocialListWith
255 -> (Maybe Integer, ConduitT () a m ())
258 flow c u cn la mfslw (mLength, docsC) logStatus = do
259 -- TODO if public insertMasterDocs else insertUserDocs
-- pair each document with a 1-based index for progress reporting
260 ids <- runConduit $ zipSources (yieldMany [1..]) docsC
263 -- ids <- traverse (\(idx, doc) -> do
264 -- id <- insertMasterDocs c la doc
265 -- logStatus JobLog { _scst_succeeded = Just $ 1 + idx
266 -- , _scst_failed = Just 0
267 -- , _scst_remaining = Just $ length docs - idx
268 -- , _scst_events = Just []
271 -- ) (zip [1..] docs)
272 flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
-- | Insert one indexed document into the master docs and log progress.
275 insertDoc :: (Integer, a) -> m NodeId
276 insertDoc (idx, doc) = do
277 id <- insertMasterDocs c la [doc]
281 logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
282 , _scst_failed = Just 0
283 , _scst_remaining = Just $ fromIntegral $ len - idx
284 , _scst_events = Just []
-- insertMasterDocs was given a singleton list, so head does not crash
-- here, but a total pattern match would be safer
286 pure $ Prelude.head id
290 ------------------------------------------------------------------------
-- | Attach inserted documents to the user's corpus and build everything
--   around them: default child nodes (texts, list, graph, dashboard), the
--   ngrams lists (user + master), and the occurrence counts.
--   NOTE(review): several signature and body lines are missing from this
--   extract — confirm the full definition against VCS.
291 flowCorpusUser :: ( FlowCmdM env err m
296 -> Either CorpusName [CorpusId]
299 -> Maybe FlowSocialListWith
301 flowCorpusUser l user corpusName ctype ids mfslw = do
-- resolve (or create) the user's root and corpus nodes
303 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
304 -- NodeTexts is first
305 _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
306 -- printDebug "NodeTexts: " tId
308 -- NodeList is second
309 listId <- getOrMkList userCorpusId userId
310 -- _cooc <- insertDefaultNode NodeListCooc listId userId
311 -- TODO: check if present already, ignore
-- link the documents to the user corpus (node_node rows)
312 _ <- Doc.add userCorpusId ids
314 -- printDebug "Node Text Ids:" tId
-- the master corpus is needed to build the ngrams lists
317 (masterUserId, _masterRootId, masterCorpusId)
318 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
320 --let gp = (GroupParams l 2 3 (StopSize 3))
321 -- Here the PosTagAlgo should be chosen according to the Lang
322 let gp = GroupWithPosTag l CoreNLP HashMap.empty
323 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
325 -- printDebug "flowCorpusUser:ngs" ngs
327 _userListId <- flowList_DbRepo listId ngs
328 _mastListId <- getOrMkList masterCorpusId masterUserId
329 -- _ <- insertOccsUpdates userCorpusId mastListId
330 -- printDebug "userListId" userListId
-- remaining default children of the corpus
332 _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
333 _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
334 --_ <- mkPhylo userCorpusId userId
336 -- _ <- mkAnnuaire rootUserId userId
337 _ <- updateNgramsOccurrences userCorpusId (Just listId)
-- | Insert documents into the master user's corpus and index their
--   extracted ngrams against the master list.
--   NOTE(review): the signature lines 343-349 and the final return
--   expression (after line 370) are missing from this extract.
342 insertMasterDocs :: ( FlowCmdM env err m
350 insertMasterDocs c lang hs = do
351 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
352 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId Nothing) hs )
-- link the new documents to the master corpus (node_node rows)
353 _ <- Doc.add masterCorpusId ids'
355 -- create a corpus with database name (CSV or PubMed)
356 -- add documents to the corpus (create node_node link)
357 -- this will enable global database monitoring
359 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
360 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
362 <$> documentIdWithNgrams
363 (extractNgramsT $ withLang lang documentsWithId)
366 lId <- getOrMkList masterCorpusId masterUserId
367 -- _ <- saveDocNgramsWith lId mapNgramsDocs'
368 _ <- saveDocNgramsWith lId mapNgramsDocs'
370 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist extracted ngrams for a list: insert the ngrams themselves,
--   the per-list node_ngrams rows, and the context/ngrams links carrying
--   the occurrence weights.
373 saveDocNgramsWith :: (FlowCmdM env err m)
375 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
377 saveDocNgramsWith lId mapNgramsDocs' = do
378 --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
-- drop the TermsCount from the innermost pairs, keeping only the weight
379 let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
380 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
382 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
385 mapCgramsId <- listInsertDb lId toNodeNgramsW'
386 $ map (first _ngramsTerms . second Map.keys)
387 $ HashMap.toList mapNgramsDocs
389 --printDebug "saveDocNgramsWith" mapCgramsId
-- one row per (term, ngrams type, context) triple, weighted by w;
-- triples whose cgrams id cannot be resolved are silently dropped
391 _return <- insertContextNodeNgrams2
392 $ catMaybes [ ContextNodeNgrams2 <$> Just nId
393 <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
394 <*> Just (fromIntegral w :: Double)
395 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
396 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
397 , (nId, (w, _cnt)) <- Map.toList mapNodeIdWeight
401 _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
406 ------------------------------------------------------------------------
407 -- TODO Type NodeDocumentUnicised
-- | Insert documents under a parent node, deduplicating them via a uniq
--   hash; returns the new context ids together with the documents paired
--   with their ids.
408 insertDocs :: ( FlowCmdM env err m
415 -> m ([ContextId], [Indexed ContextId a])
416 insertDocs uId cId hs = do
417 let docs = map addUniqId hs
418 newIds <- insertDb uId Nothing docs
419 -- printDebug "newIds" newIds
421 newIds' = map reId newIds
-- join the returned ids back onto the documents via their uniq hash
422 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
423 _ <- Doc.add cId newIds'
424 pure (newIds', documentsWithId)
427 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash; panics if the hash is absent
--   (callers pass documents that went through 'addUniqId' first).
428 viewUniqId' :: UniqId a
431 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
433 err = panic "[ERROR] Database.Flow.toInsert"
-- | Keep only the ids that were actually inserted, keyed by uniq hash.
436 toInserted :: [ReturnId]
439 Map.fromList . map (\r -> (reUniqId r, r) )
440 . filter (\r -> reInserted r == True)
-- | Join inserted 'ReturnId's with their documents by hash; documents
--   whose hash is not in the map are silently dropped (via catMaybes).
442 mergeData :: Map Hash ReturnId
444 -> [Indexed NodeId a]
445 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
447 toDocumentWithId (sha,hpd) =
448 Indexed <$> fmap reId (lookup sha rs)
451 ------------------------------------------------------------------------
452 ------------------------------------------------------------------------
453 ------------------------------------------------------------------------
-- | Run the given ngrams extractor over each indexed document, pairing
--   every document with its extracted ngrams.
454 documentIdWithNgrams :: HasNodeError err
456 -> Cmd err (HashMap b (Map NgramsType Int, TermsCount)))
457 -> [Indexed NodeId a]
458 -> Cmd err [DocumentIdWithNgrams a b]
459 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
461 toDocumentIdWithNgrams d = do
463 pure $ DocumentIdWithNgrams d e
466 -- | TODO check optimization
-- | Merge per-document ngrams into one map keyed by ngram, then ngrams
--   type, then document id, summing weights and counts on collision
--   (via 'addTuples').
467 mapNodeIdNgrams :: (Ord b, Hashable b)
468 => [DocumentIdWithNgrams a b]
471 (Map NodeId (Int, TermsCount))
473 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith addTuples)) . fmap f
475 -- | NOTE We are somehow multiplying 'TermsCount' here: If the
476 -- same ngrams term has different ngrams types, the 'TermsCount'
477 -- for it (which is the number of times the terms appears in a
478 -- document) is copied over to all its types.
479 f :: DocumentIdWithNgrams a b
480 -> HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
481 f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> Map.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
483 nId = _index $ documentWithId d
486 ------------------------------------------------------------------------
-- | Contacts contribute only their last name, tagged as an Authors ngram
--   with weight 1; missing names become the literal ngram "Nothing".
487 instance ExtractNgramsT HyperdataContact
-- ngrams longer than 255 characters are cleaned/truncated
489 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
491 extract :: TermType Lang -> HyperdataContact
492 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
494 let authors = map text2ngrams
495 $ maybe ["Nothing"] (\a -> [a])
496 $ view (hc_who . _Just . cw_lastName) hc'
498 pure $ HashMap.fromList $ [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
-- | Documents contribute their source, institutes and authors as simple
--   ngrams (weight 1 each), plus the terms extracted from the full text
--   as enriched ngrams carrying their occurrence counts.
501 instance ExtractNgramsT HyperdataDocument
503 extractNgramsT :: TermType Lang
505 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
-- ngrams longer than 255 characters are cleaned/truncated
506 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
508 extractNgramsT' :: TermType Lang
510 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
511 extractNgramsT' lang' doc = do
-- absent metadata fields become the literal ngram "Nothing"
512 let source = text2ngrams
513 $ maybe "Nothing" identity
516 institutes = map text2ngrams
517 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
520 authors = map text2ngrams
521 $ maybe ["Nothing"] (T.splitOn ", ")
-- POS-tag the extracted terms with CoreNLP, keeping noun phrases (NP)
524 termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
526 <$> liftBase (extractTerms lang' $ hasText doc)
528 pure $ HashMap.fromList
529 $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
530 <> [(SimpleNgrams i', (Map.singleton Institutes 1, 1)) | i' <- institutes ]
531 <> [(SimpleNgrams a', (Map.singleton Authors 1, 1)) | a' <- authors ]
532 <> [(EnrichedNgrams t', (Map.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
-- | Delegate ngrams extraction to the node's hyperdata payload.
534 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
536 extractNgramsT l (Node { _node_hyperdata = h }) = extractNgramsT l h
-- | Delegate text access to the node's hyperdata payload.
538 instance HasText a => HasText (Node a)
540 hasText (Node { _node_hyperdata = h }) = hasText h
544 -- | TODO putelsewhere
545 -- | Upgrade function
546 -- Suppose all documents are English (this is the case actually)
-- | Re-index every document of every corpus under the master user,
--   processing in batches of 1000 to bound memory use.
--   NOTE(review): the final return expression (after line 553) is missing
--   from this extract.
547 indexAllDocumentsWithPosTag :: FlowCmdM env err m
549 indexAllDocumentsWithPosTag = do
550 rootId <- getRootId (UserName userMaster)
551 corpusIds <- findNodesId rootId [NodeCorpus]
552 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
553 _ <- mapM extractInsert (splitEvery 1000 docs)
-- | Extract ngrams from one batch of documents (assumed multilingual
--   English, per the upgrade function above) and insert them.
--   NOTE(review): this definition appears to continue past the end of
--   this extract — the remainder is not visible here.
556 extractInsert :: FlowCmdM env err m
557 => [Node HyperdataDocument] -> m ()
558 extractInsert docs = do
-- index each document by its own node id
559 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
560 mapNgramsDocs' <- mapNodeIdNgrams
561 <$> documentIdWithNgrams
562 (extractNgramsT $ withLang (Multi EN) documentsWithId)
564 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'