]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/Main.hs
first commit
[gargantext.git] / bin / gargantext-cli / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main specifications to index a corpus with a term list
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Data.ByteString.Lazy (writeFile)
21
22 import Data.Maybe (catMaybes)
23 import Data.Text (pack)
24 import qualified Data.Text as DT
25
26 import Data.Tuple.Extra (both)
27 import qualified Data.Vector as DV
28 import qualified Data.Maybe as DMaybe
29
30 import Control.Monad (zipWithM)
31 import Control.Monad.IO.Class
32
33 import Data.Map (Map)
34 import qualified Data.IntMap as DIM
35 import qualified Data.Map as DM
36
37 import GHC.Generics
38 import Data.Aeson
39
40 import Data.Text (Text)
41 import Data.List (cycle, concat, unwords)
42 import Data.List.Split (chunksOf)
43 import System.IO (hPutStr, hFlush, stderr)
44 import System.Environment
45 import Control.Concurrent.Async as CCA (mapConcurrently)
46 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
47 import Prelude ((>>))
48
49 import Gargantext.Prelude
50 import Gargantext.Core
51 import Gargantext.Core.Types
52 import Gargantext.Core.Text.Terms
53 import Gargantext.Core.Text.Context
54 import Gargantext.Core.Text.Terms.WithList
55 import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
56 import Gargantext.Core.Text.List.CSV (csvMapTermList)
57 import Gargantext.Core.Text.Terms (terms)
58 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
59
60 ------------------------------------------------------------------------
61 -- OUTPUT format
62
63 data CoocByYear = CoocByYear { year :: Int
64 , nbContexts :: NbContexts
65 , coocurrences :: Map (Text, Text) Coocs
66 } deriving (Show, Generic)
67
68 data CoocByYears = CoocByYears { years :: [CoocByYear] }
69 deriving (Show, Generic)
70
71 type NbContexts = Int
72
73 instance ToJSON CoocByYear
74 instance ToJSON CoocByYears
75 ------------------------------------------------------------------------
76
77 filterTermsAndCooc
78 :: Patterns
79 -> (Int, [Text])
80 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
81 filterTermsAndCooc patterns (year, ts) = do
82 log "start"
83 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
84 log "stop"
85 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
86 where
87
88 log m = do
89 tid <- myThreadId
90 (p, _) <- threadCapability tid
91 putStrLn . unwords $
92 ["filterTermsAndCooc:", m, show year, "on proc", show p]
93
94 main :: IO ()
95 main = do
96 [corpusFile, termListFile, outputFile] <- getArgs
97
98 --corpus :: IO (DM.IntMap [[Text]])
99 corpus <- DM.fromListWith (<>)
100 . DV.toList
101 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
102 . snd
103 <$> readFile corpusFile
104
105 -- termListMap :: [Text]
106 termList <- csvMapTermList termListFile
107
108 putStrLn $ show $ length termList
109
110 let patterns = buildPatterns termList
111
112 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
113 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
114 writeFile outputFile $ encode (CoocByYears r)
115
116
117
118 ------------------------------------------------------------------------
119 -- | Tools
120 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
121 mapMP f xs = do
122 bs <- zipWithM g (cycle "-\\|/") xs
123 liftIO $ hPutStr stderr "\rDone\n"
124 pure bs
125 where
126 g c x = do
127 liftIO $ hPutStr stderr ['\r',c]
128 liftIO $ hFlush stderr
129 f x
130
131 -- | Optimi that need further developments (not used yet)
132 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
133 mapConcurrentlyChunked f ts = do
134 caps <- getNumCapabilities
135 let n = 1 `max` (length ts `div` caps)
136 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
137
138
139 --terms' :: Patterns -> Text -> Corpus [[Text]]
140 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
141
142
143 -- | TODO Minimal Example
144 --testCooc = do
145 -- let patterns = buildPatterns testTermList
146 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
147 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
148
149
150 testCorpus :: [(Int, [Text])]
151 testCorpus = [ (1998, [pack "The beees"])
152 , (1999, [ pack "The bees and the flowers"
153 --, pack "The bees and the flowers"
154 ])
155 ]
156
157 testTermList :: TermList
158 testTermList = [ ([pack "bee"], [[pack "bees"]])
159 , ([pack "flower"], [[pack "flowers"]])
160 ]
161