]> Git — Sourcephile - gargantext.git/blob - bin/gargantext-cli/Main.hs
[ngrams] add score update endpoint + sorting
[gargantext.git] / bin / gargantext-cli / Main.hs
1 {-|
2 Module : Main.hs
3 Description : Gargantext starter
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Main specifications to index a corpus with a term list
11
12 -}
13
14 {-# LANGUAGE StandaloneDeriving #-}
15 {-# LANGUAGE TypeOperators #-}
16 {-# LANGUAGE Strict #-}
17
18 module Main where
19
20 import Data.ByteString.Lazy (writeFile)
21
22 import Data.Text (pack)
23 import qualified Data.Text as DT
24
25 import Data.Tuple.Extra (both)
26 import qualified Data.Vector as DV
27
28 import Control.Monad (zipWithM)
29 import Control.Monad.IO.Class
30
31 import Data.Map (Map)
32 import qualified Data.Map as DM
33
34 import GHC.Generics
35 import Data.Aeson
36
37 import Data.Text (Text)
38 import Data.List (cycle, concat, unwords)
39 import Data.List.Split (chunksOf)
40 import System.IO (hPutStr, hFlush, stderr)
41 import System.Environment
42 import Control.Concurrent.Async as CCA (mapConcurrently)
43 import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
44
45 import Gargantext.Prelude
46 import Gargantext.Core
47 import Gargantext.Core.Types
48 import Gargantext.Core.Text.Terms
49 import Gargantext.Core.Text.Context
50 import Gargantext.Core.Text.Terms.WithList
51 import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
52 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
53 import Gargantext.Core.Text.Terms (terms)
54 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
55
56 ------------------------------------------------------------------------
57 -- OUTPUT format
58
59 data CoocByYear = CoocByYear { year :: Int
60 , nbContexts :: NbContexts
61 , coocurrences :: Map (Text, Text) Coocs
62 } deriving (Show, Generic)
63
64 data CoocByYears = CoocByYears { years :: [CoocByYear] }
65 deriving (Show, Generic)
66
67 type NbContexts = Int
68
69 instance ToJSON CoocByYear
70 instance ToJSON CoocByYears
71 ------------------------------------------------------------------------
72
73 filterTermsAndCooc
74 :: Patterns
75 -> (Int, [Text])
76 -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
77 filterTermsAndCooc patterns (year, ts) = do
78 log "start"
79 r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
80 log "stop"
81 pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
82 where
83
84 log m = do
85 tid <- myThreadId
86 (p, _) <- threadCapability tid
87 putStrLn . unwords $
88 ["filterTermsAndCooc:", m, show year, "on proc", show p]
89
90 main :: IO ()
91 main = do
92 [corpusFile, termListFile, outputFile] <- getArgs
93
94 --corpus :: IO (DM.IntMap [[Text]])
95 corpus <- DM.fromListWith (<>)
96 . DV.toList
97 . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
98 . snd
99 <$> readFile corpusFile
100
101 -- termListMap :: [Text]
102 termList <- csvMapTermList termListFile
103
104 putStrLn $ show $ length termList
105
106 let patterns = buildPatterns termList
107
108 -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
109 r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
110 writeFile outputFile $ encode (CoocByYears r)
111
112
113
114 ------------------------------------------------------------------------
115 -- | Tools
116 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
117 mapMP f xs = do
118 bs <- zipWithM g (cycle "-\\|/") xs
119 liftIO $ hPutStr stderr "\rDone\n"
120 pure bs
121 where
122 g c x = do
123 liftIO $ hPutStr stderr ['\r',c]
124 liftIO $ hFlush stderr
125 f x
126
127 -- | Optimi that need further developments (not used yet)
128 mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
129 mapConcurrentlyChunked f ts = do
130 caps <- getNumCapabilities
131 let n = 1 `max` (length ts `div` caps)
132 concat <$> mapConcurrently (mapM f) (chunksOf n ts)
133
134
135 --terms' :: Patterns -> Text -> Corpus [[Text]]
136 terms' pats txt = pure $ concat $ extractTermsWithList pats txt
137
138
139 -- | TODO Minimal Example
140 --testCooc = do
141 -- let patterns = buildPatterns testTermList
142 -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
143 -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
144
145
146 testCorpus :: [(Int, [Text])]
147 testCorpus = [ (1998, [pack "The beees"])
148 , (1999, [ pack "The bees and the flowers"
149 --, pack "The bees and the flowers"
150 ])
151 ]
152
153 testTermList :: TermList
154 testTermList = [ ([pack "bee"], [[pack "bees"]])
155 , ([pack "flower"], [[pack "flowers"]])
156 ]
157