3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import qualified Data.Vector as DV
27 import Control.Monad (zipWithM)
28 import Control.Monad.IO.Class
29 import Data.Text (Text)
30 import Data.List (cycle)
31 import System.IO (hPutStr, hFlush, stderr)
32 import System.Environment
33 --import Control.Concurrent.Async as CCA (mapConcurrently)
35 import Gargantext.Prelude
36 import Gargantext.Text.Context
37 import Gargantext.Text.Terms
38 import Gargantext.Text.Terms.WithList
39 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract)
40 import Gargantext.Text.List.CSV (csvGraphTermList)
41 import Gargantext.Text.Terms (terms)
42 import Gargantext.Text.Metrics.Count (cooc)
44 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
46 bs <- zipWithM g (cycle "-\\|/") xs
47 liftIO $ hPutStr stderr "\rDone\n"
51 liftIO $ hPutStr stderr ['\r',c]
52 liftIO $ hFlush stderr
57 [corpusFile, termListFile, outputFile] <- getArgs
60 corpus <- DV.toList <$> map (\n -> (csv_title n) <> " " <> (csv_abstract n))
62 <$> readCsv corpusFile
64 putStrLn $ show $ length corpus
65 -- termListMap :: [Text]
66 termList <- csvGraphTermList termListFile
68 putStrLn $ show $ length termList
70 let patterns = WithList $ buildPatterns termList
71 corpusIndexed <- mapMP (terms patterns) corpus
72 mapM (putStrLn . show) corpusIndexed
73 let myCooc = cooc corpusIndexed
75 putStrLn $ show myCooc
76 --writeFile outputFile cooc