]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/Main.hs
[CLI FIX] for now better perf with mapConcurrently only.
[gargantext.git] / bin / gargantext-cli / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main specifications to index a corpus with a term list
11
12 -}
13
14 {-# LANGUAGE DataKinds #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE StandaloneDeriving #-}
20 {-# LANGUAGE TypeOperators #-}
21 {-# LANGUAGE Strict #-}
22
23 module Main where
24
25 import qualified Data.Vector as DV
26 import qualified Data.Maybe as DMaybe
27
28 import Control.Monad (zipWithM)
29 import Control.Monad.IO.Class
30
31 import qualified Data.IntMap as DM
32
33 import Data.Map (Map)
34 import Data.Text (Text)
35 import Data.List (cycle, concat, unwords)
36 import Data.List.Split (chunksOf)
37 import System.IO (hPutStr, hFlush, stderr)
38 import System.Environment
39 import Control.Concurrent.Async as CCA (mapConcurrently)
40 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
41 import Prelude ((>>))
42
43 import Gargantext.Prelude
44 import Gargantext.Core
45 import Gargantext.Core.Types
46 import Gargantext.Text.Terms
47 import Gargantext.Text.Terms.WithList
48 import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
49 import Gargantext.Text.List.CSV (csvGraphTermList)
50 import Gargantext.Text.Terms (terms)
51 import Gargantext.Text.Metrics.Count (coocOn, Coocs)
52
53 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
54 mapMP f xs = do
55 bs <- zipWithM g (cycle "-\\|/") xs
56 liftIO $ hPutStr stderr "\rDone\n"
57 pure bs
58 where
59 g c x = do
60 liftIO $ hPutStr stderr ['\r',c]
61 liftIO $ hFlush stderr
62 f x
63
64 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
65 mapConcurrentlyChunked f ts = do
66 caps <- getNumCapabilities
67 let n = 1 `max` (length ts `div` caps)
68 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
69
70 filterTermsAndCooc
71 :: TermType Lang
72 -> (Int, [Text])
73 -> IO (Map (Terms, Terms) Coocs)
74 filterTermsAndCooc patterns (year, ts) = do
75 log "start"
76 r <- coocOn identity <$> mapM (\x -> {-log "work" >>-} terms patterns x) ts
77 log "stop"
78 pure r
79 where
80 log m = do
81 tid <- myThreadId
82 (p, _) <- threadCapability tid
83 putStrLn . unwords $
84 ["filterTermsAndCooc:", m, show year, "on proc", show p]
85
86 --main :: IO [()]
87 main = do
88 [corpusFile, termListFile, _] <- getArgs
89
90 --corpus :: IO (DM.IntMap [[Text]])
91 corpus <- DM.fromListWith (<>)
92 . DV.toList
93 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
94 . snd
95 <$> readCsv corpusFile
96
97 -- termListMap :: [Text]
98 termList <- csvGraphTermList termListFile
99
100 putStrLn $ show $ length termList
101
102 let patterns = WithList $ buildPatterns termList
103
104 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
105 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
106 putStrLn $ show r
107 --writeFile outputFile cooc