3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import qualified Data.Vector as DV
26 import qualified Data.Maybe as DMaybe
28 import Control.Monad (zipWithM)
29 import Control.Monad.IO.Class
31 import qualified Data.IntMap as DM
34 import Data.Text (Text)
35 import Data.List (cycle)
36 import System.IO (hPutStr, hFlush, stderr)
37 import System.Environment
38 import Control.Concurrent.Async as CCA (mapConcurrently)
40 import Gargantext.Prelude
41 import Gargantext.Core
42 import Gargantext.Core.Types
43 import Gargantext.Text.Terms
44 import Gargantext.Text.Terms.WithList
45 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
46 import Gargantext.Text.List.CSV (csvGraphTermList)
47 import Gargantext.Text.Terms (terms)
48 import Gargantext.Text.Metrics.Count (coocOn, Coocs)
-- | Monadic map over a list that shows a text spinner on stderr while it runs.
--
-- NOTE(review): this definition is only partially visible here. The defining
-- equation, the header of the local helper @g@ and the lines that yield the
-- results are missing, so the comments below describe the visible lines only.
50 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
-- Pair each input element with the next character of the endlessly repeated
-- spinner frames (@-@, @\\@, @|@, @/@) and run @g@ on every pair in order.
52 bs <- zipWithM g (cycle "-\\|/") xs
-- Return to the start of the line and replace the spinner with "Done".
53 liftIO $ hPutStr stderr "\rDone\n"
-- Presumably the body of the helper @g@ (its header is not visible; confirm):
-- move back to the start of the line and print the current spinner frame @c@.
57 liftIO $ hPutStr stderr ['\r',c]
-- Flush stderr explicitly so buffered output is written out at this point.
58 liftIO $ hFlush stderr
-- NOTE(review): the beginning of this type signature (function name and
-- argument types) is not visible here; only the result type is shown.
67 -> IO (Map (Terms, Terms) Coocs)
-- Takes the term patterns and a @(year, ts)@ pair. In the visible caller the
-- pair is a publication year together with that year's list of texts.
68 filterTermsAndCooc patterns (year, ts) = do
-- Progress message naming the year whose processing begins.
69 putStrLn $ "start filterTermsAndCooc " <> show year
-- Run 'terms' with the given patterns on every element of @ts@, then hand the
-- collected results to @coocOn identity@.
70 r <- coocOn identity <$> mapM (terms patterns) ts
-- Progress message naming the year whose processing has finished.
-- NOTE(review): the line that yields @r@ as the result is not visible here.
71 putStrLn $ "stop filterTermsAndCooc " <> show year
-- NOTE(review): the header of the enclosing definition and several short
-- lines of this body are not visible here; the comments describe the visible
-- lines only.
--
-- Exactly three command-line arguments are matched: the corpus CSV path, the
-- term-list CSV path, and a third value that is ignored. Any other argument
-- count does not match this list pattern.
76 [corpusFile, termListFile, _] <- getArgs
78 --corpus :: IO (DM.IntMap [[Text]])
-- Build an IntMap from the CSV records; values that share a key are combined
-- with (<>).
79 corpus <- DM.fromListWith (<>)
-- Each record becomes (publication year, [title <> " " <> abstract]).
81 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
-- The records come from reading the corpus file given as first argument.
83 <$> readCsv corpusFile
85 -- termListMap :: [Text]
-- Load the term list from the file given as second argument.
86 termList <- csvGraphTermList termListFile
-- Print the number of entries in the term list.
88 putStrLn $ show $ length termList
-- The distinct keys (publication years) present in the corpus map.
90 let years = DM.keys corpus
-- Build patterns from the term list and wrap them with 'WithList'.
91 let patterns = WithList $ buildPatterns termList
-- Look each year up again to get its texts. @years@ comes from the keys of
-- @corpus@, so no lookup is expected to return Nothing and @corpus'@ lines up
-- with @years@ in the zip below.
-- NOTE(review): @DM.elems corpus@ looks equivalent and simpler; confirm.
92 let corpus' = DMaybe.catMaybes $ map (\k -> DM.lookup k corpus) years
-- Run 'filterTermsAndCooc' concurrently over all (year, texts) pairs.
95 r <- mapConcurrently (filterTermsAndCooc patterns) (zip years corpus')
97 --writeFile outputFile cooc