src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
  13   where
  14
  15 import Control.Applicative ((*>))
  16 import Control.Monad (void)
  17 import Data.Either
  18 import Data.Maybe
  19 import Data.Text hiding (foldl)
  20 import Gargantext.Core.Text (sentences)
  21 import Gargantext.Prelude
  22 import Prelude ((++), read)
  23 import Text.Parsec hiding (Line)
  24 import Text.Parsec.String
  25 import qualified Data.Text as DT
  26 import qualified Data.List as List
  27
  28
  29 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
  30
  31 -- Authors : default : anonymous ; except if the following line is encountered
  32 -- ^authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
  33 -- date : default : date of last change except if the following line is encountered  ^@@date: 2021-09-10
  34 -- source: Name of the root node except if the following line is encountered ^@@source:
  35 -- By default, 1 framawrite node = 1 document.  Option for further developments: allow to give a level at generation for the split within framawrite node : :
  36 --
  37 -- par défaut: un doc == 1 NodeWrite
  38 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
  39
  40 sample :: Text
  41 sample =
  42   unlines
  43     [ "title1"
  44 --    , "title2"
  45 --    , "=="
  46 --     , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
  47     , "date: 2021-09-10"
  48     , "source: someSource"
  49     , "document contents 1"
  50     , "document contents 2"
  51     ]
  52
  53 sampleUnordered :: Text
  54 sampleUnordered =
  55   unlines
  56     [ "title1"
  57     , "title2"
  58     , "=="
  59     , "document contents 1"
  60     , "date: 2021-09-10"
  61     , "authors: FirstName1, LastName1; FirstName2, LastName2"
  62     , "source: someSource"
  63     , "document contents 2"
  64     ]
  65
  66 -- parseSample = parse documentP "sample" (unpack sample)
  67 -- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
  68 parseLinesSample :: Either ParseError Parsed
  69 parseLinesSample = parseLines sample
  70 parseLinesSampleUnordered :: Either ParseError Parsed
  71 parseLinesSampleUnordered = parseLines sampleUnordered
  72
  73 data Author =
  74     Author { firstName :: Text
  75            , lastName  :: Text }
  76     deriving (Show)
  77
  78 data Parsed =
  79   Parsed { title    :: Text
  80          , authors  :: [Author]
  81          , date     :: Maybe Date
  82          , source   :: Maybe Text
  83          , contents :: Text }
  84   deriving (Show)
  85
  86 emptyParsed :: Parsed
  87 emptyParsed =
  88   Parsed { title = ""
  89          , authors = []
  90          , date = Nothing
  91          , source = Nothing
  92          , contents = "" }
  93
  94 data Date =
  95   Date { year  :: Integer
  96        , month :: Integer
  97        , day   :: Integer
  98        }
  99   deriving (Show)
 100
 101 data Line =
 102     LAuthors [Author]
 103   | LContents Text
 104   | LDate Date
 105   | LSource Text
 106   | LTitle Text
 107   deriving (Show)
 108
 109 parseLines :: Text -> Either ParseError Parsed
 110 parseLines text = foldl f emptyParsed <$> lst
 111   where
 112     lst = parse documentLines "" (unpack text)
 113     f (Parsed { .. }) (LAuthors as) = Parsed { authors  = as                      , .. }
 114     f (Parsed { .. }) (LContents c) = Parsed { contents = DT.unlines [contents, c], .. }
 115     f (Parsed { .. }) (LDate    d ) = Parsed { date     = Just d                  , .. }
 116     f (Parsed { .. }) (LSource  s ) = Parsed { source   = Just s                  , .. }
 117     f (Parsed { .. }) (LTitle   t ) = Parsed { title    = t                       , .. }
 118
 119 -- Source should be the name of the node
 120 -- First line of each Context should be the title.
 121 documentLinesP :: Parser [Line]
 122 documentLinesP = do
 123   t  <- titleP
 124   ls <- lineP `sepBy` newline
 125   pure $ [LTitle $ pack t] ++ ls
 126
 127 documentLines :: Parser [Line]
 128 documentLines = do
 129   ls <- lineP `sepBy` newline
 130   pure ls
 131
 132 lineP :: Parser Line
 133 lineP = do
 134   choice [ try authorsLineP
 135          , try dateLineP
 136          , try sourceLineP
 137          , contentsLineP
 138          ]
 139
 140 authorsLineP :: Parser Line
 141 authorsLineP = do
 142   authors <- authorsP
 143   pure $ LAuthors authors
 144
 145 dateLineP :: Parser Line
 146 dateLineP = do
 147   date <- dateP
 148   pure $ LDate date
 149
 150 sourceLineP :: Parser Line
 151 sourceLineP = do
 152   source <- sourceP
 153   pure $ LSource $ pack source
 154
 155 contentsLineP :: Parser Line
 156 contentsLineP = do
 157   contents <- many (noneOf "\n")
 158   pure $ LContents $ pack contents
 159
 160 --------------------
 161
 162 -- documentP = do
 163 --   t <- titleP
 164 --   a <- optionMaybe authorsP
 165 --   d <- optionMaybe dateP
 166 --   s <- optionMaybe sourceP
 167 --   c <- contentsP
 168 --   pure $ Parsed { title = pack t
 169 --                 , authors = fromMaybe [] a
 170 --                 , date = pack <$> d
 171 --                 , source = pack <$> s
 172 --                 , contents = pack c }
 173
 174 titleDelimiterP :: Parser ()
 175 titleDelimiterP = do
 176   _ <- newline
 177   -- _ <- try (string "==")
 178   pure ()
 179
 180 titleP :: Parser [Char]
 181 titleP = manyTill anyChar (try titleDelimiterP)
 182
 183 authorsPrefixP :: Parser [Char]
 184 authorsPrefixP = do
 185   _ <- string "authors:"
 186   many (char ' ')
 187 authorsP :: Parser [Author]
 188 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
 189 authorP :: Parser Author
 190 authorP = do
 191   fn <- manyTill anyChar (char ',')
 192   _ <- many (char ' ')
 193   --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
 194   --ln <- manyTill anyChar (tokenEnd)
 195   ln <- many (noneOf "\n")
 196   pure $ Author { firstName = pack fn, lastName = pack ln }
 197   -- manyTill anyChar (void (char '\n') <|> eof)
 198
 199 datePrefixP :: Parser [Char]
 200 datePrefixP = do
 201   _ <- string "date:"
 202   many (char ' ')
 203 dateP :: Parser Date
 204 dateP = try datePrefixP
 205          *> dateISOP
 206         --  *> many (noneOf "\n")
 207
 208 dateISOP :: Parser Date
 209 dateISOP = do
 210   year <- rd <$> number
 211   _ <- char '-'
 212   month <- rd <$> number
 213   _ <- char '-'
 214   day <- rd <$> number
 215   _ <- many (noneOf "\n" )
 216   pure $ Date { year, month, day }
 217   where
 218     rd = read :: [Char] -> Integer
 219     number = many1 digit
 220
 221 sourceP :: Parser [Char]
 222 sourceP = try sourcePrefixP
 223           *> many (noneOf "\n")
 224   where
 225     sourcePrefixP :: Parser [Char]
 226     sourcePrefixP = do
 227       _ <- string "source:"
 228       many (char ' ')
 229
 230 -- contentsP :: Parser String
 231 -- contentsP = many anyChar
 232
 233 tokenEnd :: Parser ()
 234 tokenEnd = void (char '\n') <|> eof
 235
 236 --- MISC Tools
 237 -- Using ChunkAlong here enable redundancies in short corpora of texts
 238 -- maybe use splitEvery or chunkAlong depending on the size of the whole text
 239 text2titleParagraphs :: Int -> Text -> [(Text, Text)]
 240 text2titleParagraphs n = catMaybes
 241                        . List.map doTitle
 242                        . (chunkAlong n' n)
 243                        -- . (splitEvery n)
 244                        . sentences
 245                        . DT.intercalate " " -- ". "
 246                        . List.filter (/= "")
 247                        . DT.lines
 248   where
 249     n' = n + (round $ (fromIntegral n) / (2 :: Double))
 250
 251 doTitle :: [Text] -> Maybe (Text, Text)
 252 doTitle (t:ts) = Just (t, DT.intercalate " " ts)
 253 doTitle [] = Nothing
 254
 255
 256 clean :: Text -> Text
 257 clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
 258