-}
{-# LANGUAGE StandaloneDeriving #-}
-{-# LANGUAGE TypeOperators #-}
+{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
-import Data.ByteString.Lazy (writeFile)
-
-import Data.Maybe (catMaybes)
-import Data.Text (pack)
-import qualified Data.Text as DT
-
-import Data.Tuple.Extra (both)
-import qualified Data.Vector as DV
-import qualified Data.Maybe as DMaybe
-
+import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
-
-import Data.Map (Map)
-import qualified Data.IntMap as DIM
-import qualified Data.Map as DM
-
-import GHC.Generics
import Data.Aeson
-
-import Data.Text (Text)
+import Data.ByteString.Lazy (writeFile)
+import Data.Either (Either(..))
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as DM
+import Data.Text (pack, Text)
+import qualified Data.Text as DT
+import Data.Tuple.Extra (both)
+import qualified Data.Vector as DV
+import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
-import Control.Concurrent.Async as CCA (mapConcurrently)
-import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
-import Prelude ((>>))
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList
-import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
+import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
- corpus <- DM.fromListWith (<>)
- . DV.toList
- . DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
- . snd
- <$> readFile corpusFile
+ eCorpusFile <- readCSVFile corpusFile
+ case eCorpusFile of
+ Right cf -> do
+ let corpus = DM.fromListWith (<>)
+ . DV.toList
+ . DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
+ . snd $ cf
- -- termListMap :: [Text]
- termList <- csvMapTermList termListFile
+ -- termListMap :: [Text]
+ termList <- csvMapTermList termListFile
- putStrLn $ show $ length termList
+ putStrLn $ show $ length termList
- let patterns = buildPatterns termList
+ let patterns = buildPatterns termList
- -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
- r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
- writeFile outputFile $ encode (CoocByYears r)
+ -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
+ r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
+ writeFile outputFile $ encode (CoocByYears r)
+ Left e -> panic $ "Error: " <> (pack e)
{- | Small hand-written sample corpus: a list of (year, documents) pairs.

It holds one document for 1998 and one for 1999; a second 1999 document is
present only as a commented-out line.

NOTE(review): "The beees" (three e's) may be a deliberate near-miss for the
term "bees" or simply a typo -- confirm before changing it.
NOTE(review): the line pairs prefixed with '-' and '+' below look like
leftover diff markers; both sides carry the same tokens and differ only in
whitespace.
-}
testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
- , (1999, [ pack "The bees and the flowers"
- --, pack "The bees and the flowers"
+ , (1999, [ pack "The bees and the flowers"
+ --, pack "The bees and the flowers"
])
]
{- | Small hand-written sample term list.

Each entry is a pair whose first component is a list of 'Text' tokens and
whose second component is a list of token sequences: @bee@ is paired with
@bees@, and @flower@ with @flowers@.

NOTE(review): presumably the pair is (label, alternative forms) in the shape
that @buildPatterns@ consumes -- confirm against that function's signature.
-}
testTermList :: [([Text], [[Text]])]
testTermList =
  [ ([pack "bee"], [[pack "bees"]])
  , ([pack "flower"], [[pack "flowers"]])
  ]
-