3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Main specifications to index a corpus with a term list
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
25 import qualified Data.Vector as DV
26 import qualified Data.Maybe as DMaybe
28 import Control.Monad (zipWithM)
29 import Control.Monad.IO.Class
31 import qualified Data.IntMap as DM
34 import Data.Text (Text)
35 import Data.List (cycle, concat, unwords)
36 import Data.List.Split (chunksOf)
37 import System.IO (hPutStr, hFlush, stderr)
38 import System.Environment
39 import Control.Concurrent.Async as CCA (mapConcurrently)
40 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
43 import Gargantext.Prelude
44 import Gargantext.Core
45 import Gargantext.Core.Types
46 import Gargantext.Text.Terms
47 import Gargantext.Text.Terms.WithList
48 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
49 import Gargantext.Text.List.CSV (csvGraphTermList)
50 import Gargantext.Text.Terms (terms)
51 import Gargantext.Text.Metrics.Count (coocOn, Coocs)
53 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
55 bs <- zipWithM g (cycle "-\\|/") xs
56 liftIO $ hPutStr stderr "\rDone\n"
60 liftIO $ hPutStr stderr ['\r',c]
61 liftIO $ hFlush stderr
64 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
65 mapConcurrentlyChunked f ts = do
66 caps <- getNumCapabilities
67 let n = 1 `max` (length ts `div` caps)
68 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
73 -> IO (Map (Terms, Terms) Coocs)
74 filterTermsAndCooc patterns (year, ts) = do
76 r <- coocOn identity <$> mapM (\x -> {-log "work" >>-} terms patterns x) ts
82 (p, _) <- threadCapability tid
84 ["filterTermsAndCooc:", m, show year, "on proc", show p]
88 [corpusFile, termListFile, _] <- getArgs
90 --corpus :: IO (DM.IntMap [[Text]])
91 corpus <- DM.fromListWith (<>)
93 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
95 <$> readCsv corpusFile
97 -- termListMap :: [Text]
98 termList <- csvGraphTermList termListFile
100 putStrLn $ show $ length termList
102 let patterns = WithList $ buildPatterns termList
104 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
105 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
107 --writeFile outputFile cooc