src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs

   1 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite where
   2
   3 import Control.Applicative ((*>))
   4 import Control.Monad (void)
   5 import Data.Maybe
   6 import Data.Text
   7 import Gargantext.Prelude
   8 import Prelude (String, (++))
   9 import Text.Parsec hiding (Line)
  10 import Text.Parsec.Combinator
  11 import Text.Parsec.String
  12
  13
  14 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
  15
  16 -- title : everything above the first ==
  17 -- Authors : default : anonymous ; except if the following line is encountered ^@@authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
  18 -- date : default : date of last change except if the following line is encountered  ^@@date: 2021-09-10
  19 -- source: Name of the root node except if the following line is encountered ^@@source:
  20 -- By default, 1 framawrite node = 1 document.  Option for further developments: allow to give a level at generation for the split within framawrite node : :
  21 --
  22 -- par défaut: un doc == 1 NodeWrite
  23 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
  24
  25 sample =
  26   unlines
  27     [ "title1"
  28     , "title2"
  29     , "=="
  30     , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
  31     , "^@@date: 2021-09-10"
  32     , "^@@source: someSource"
  33     , "document contents 1"
  34     , "document contents 2"
  35     ]
  36
  37 sampleUnordered =
  38   unlines
  39     [ "title1"
  40     , "title2"
  41     , "=="
  42     , "document contents 1"
  43     , "^@@date: 2021-09-10"
  44     , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
  45     , "^@@source: someSource"
  46     , "document contents 2"
  47     ]
  48
  49 parseSample = parse documentP "sample" (unpack sample)
  50 parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
  51 parseLinesSample = parse documentLinesP "sample" (unpack sample)
  52 parseLinesSampleUnordered = parse documentLinesP "sampleUnordered" (unpack sampleUnordered)
  53
  54 data Author =
  55     Author { firstName :: Text
  56            , lastName :: Text }
  57     deriving (Show)
  58
  59 data Parsed =
  60   Parsed { title :: Text
  61          , authors :: [Author]
  62          , date :: Maybe Text
  63          , source :: Maybe Text
  64          , contents :: Text }
  65   deriving (Show)
  66
  67 emptyParsed =
  68   Parsed { title = ""
  69          , authors = []
  70          , date = Nothing
  71          , source = Nothing
  72          , contents = "" }
  73
  74 data Line =
  75     LAuthors [Author]
  76   | LDate Text
  77   | LSource Text
  78   | LContents Text
  79   | LTitle Text
  80   deriving (Show)
  81
  82 parseLines :: Text -> Parsed
  83 parseLines text = foldl f emptyParsed lst
  84   where
  85     lst = parse documentLinesP "" (unpack text)
  86     f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
  87     f (Parsed { .. }) (LDate    d ) = Parsed { date = d, .. }
  88     f (Parsed { .. }) (LSource  s ) = Parsed { source = s, .. }
  89     f (Parsed { .. }) (LContents c) = Parsed { contents = contents ++ c, .. }
  90     f (Parsed { .. }) (LTitle   t ) = Parsed { title = t, .. }
  91
  92 documentLinesP = do
  93   t <- titleP
  94   lines <- lineP `sepBy` newline
  95   pure $ [LTitle $ pack t] ++ lines
  96
  97 lineP :: Parser Line
  98 lineP = do
  99   choice [ try authorsLineP
 100          , try dateLineP
 101          , try sourceLineP
 102          , contentsLineP ]
 103
 104 authorsLineP = do
 105   authors <- authorsP
 106   pure $ LAuthors authors
 107
 108 dateLineP = do
 109   date <- dateP
 110   pure $ LDate $ pack date
 111
 112 sourceLineP = do
 113   source <- sourceP
 114   pure $ LSource $ pack source
 115
 116 contentsLineP = do
 117   contents <- many (noneOf "\n")
 118   pure $ LContents $ pack contents
 119
 120 --------------------
 121
 122 documentP = do
 123   t <- titleP
 124   a <- optionMaybe authorsP
 125   d <- optionMaybe dateP
 126   s <- optionMaybe sourceP
 127   c <- contentsP
 128   pure $ Parsed { title = pack t
 129                 , authors = fromMaybe [] a
 130                 , date = pack <$> d
 131                 , source = pack <$> s
 132                 , contents = pack c }
 133
 134 titleDelimiterP = do
 135   newline
 136   string "=="
 137   tokenEnd
 138 titleP :: Parser [Char]
 139 titleP = manyTill anyChar (try titleDelimiterP)
 140
 141 authorsPrefixP = do
 142   _ <- string "^@@authors:"
 143   many (char ' ')
 144 authorsP :: Parser [Author]
 145 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
 146 authorP :: Parser Author
 147 authorP = do
 148   fn <- manyTill anyChar (char ',')
 149   _ <- many (char ' ')
 150   --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
 151   --ln <- manyTill anyChar (tokenEnd)
 152   ln <- many (noneOf "\n")
 153   pure $ Author { firstName = pack fn, lastName = pack ln }
 154   -- manyTill anyChar (void (char '\n') <|> eof)
 155
 156 datePrefixP = do
 157   _ <- string "^@@date:"
 158   many (char ' ')
 159 dateP :: Parser [Char]
 160 dateP = try datePrefixP
 161         *> many (noneOf "\n")
 162
 163 sourcePrefixP = do
 164   _ <- string "^@@source:"
 165   many (char ' ')
 166 sourceP :: Parser [Char]
 167 sourceP = try sourcePrefixP
 168           *> many (noneOf "\n")
 169
 170 contentsP :: Parser String
 171 contentsP = many anyChar
 172
 173 tokenEnd = void (char '\n') <|> eof