parser.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. /*
  2. * MIT License
  3. *
  4. * Copyright (c) 2020 Alexey Edelev <semlanik@gmail.com>
  5. *
  6. * This file is part of gostfix project https://git.semlanik.org/semlanik/gostfix
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy of this
  9. * software and associated documentation files (the "Software"), to deal in the Software
  10. * without restriction, including without limitation the rights to use, copy, modify,
  11. * merge, publish, distribute, sublicense, and/or sell copies of the Software, and
  12. * to permit persons to whom the Software is furnished to do so, subject to the following
  13. * conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all copies
  16. * or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  19. * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  20. * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  21. * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  22. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. package scanner
  26. import (
  27. "bufio"
  28. "bytes"
  29. "encoding/hex"
  30. "errors"
  31. "fmt"
  32. "log"
  33. "os"
  34. "strings"
  35. "time"
  36. "git.semlanik.org/semlanik/gostfix/common"
  37. "git.semlanik.org/semlanik/gostfix/config"
  38. utils "git.semlanik.org/semlanik/gostfix/utils"
  39. "github.com/google/uuid"
  40. enmime "github.com/jhillyerd/enmime"
  41. )
  42. const (
  43. StateHeaderScan = iota
  44. StateBodyScan
  45. )
  46. const (
  47. AtLeastOneHeaderMask = 1 << iota
  48. FromHeaderMask
  49. DateHeaderMask
  50. ToHeaderMask
  51. AllHeaderMask = 15
  52. )
  53. type parseData struct {
  54. state int
  55. mandatoryHeaders int
  56. previousHeader *string
  57. email *common.Mail
  58. bodyContentType string
  59. bodyData string
  60. activeBoundary string
  61. }
  62. func (pd *parseData) reset() {
  63. *pd = parseData{
  64. state: StateHeaderScan,
  65. previousHeader: nil,
  66. mandatoryHeaders: 0,
  67. email: NewEmail(),
  68. bodyContentType: "plain/text",
  69. bodyData: "",
  70. activeBoundary: "",
  71. }
  72. }
  73. func parseFile(file *utils.LockedFile) []*common.Mail {
  74. log.Println("Parse file")
  75. defer log.Println("Exit parse")
  76. var emails []*common.Mail
  77. pd := &parseData{}
  78. pd.reset()
  79. scanner := bufio.NewScanner(file)
  80. for scanner.Scan() {
  81. log.Println("Scan next line")
  82. currentText := scanner.Text()
  83. if utils.RegExpUtilsInstance().MailIndicator.MatchString(currentText) {
  84. if pd.mandatoryHeaders == AllHeaderMask {
  85. pd.parseBody()
  86. emails = append(emails, pd.email)
  87. }
  88. pd.reset()
  89. fmt.Println("Found new email" + currentText)
  90. continue
  91. }
  92. switch pd.state {
  93. case StateHeaderScan:
  94. if currentText == "" {
  95. if pd.mandatoryHeaders&AtLeastOneHeaderMask == AtLeastOneHeaderMask { //Cause we read at least one header
  96. pd.previousHeader = nil
  97. boundaryCapture := utils.RegExpUtilsInstance().BoundaryFinder.FindStringSubmatch(pd.bodyContentType)
  98. if len(boundaryCapture) == 2 {
  99. pd.activeBoundary = boundaryCapture[1]
  100. } else {
  101. pd.activeBoundary = ""
  102. }
  103. pd.state = StateBodyScan
  104. }
  105. } else {
  106. pd.parseHeader(currentText)
  107. }
  108. case StateBodyScan:
  109. // if currentText == "" {
  110. // if pd.state == StateBodyScan && pd.activeBoundary == "" {
  111. // if pd.mandatoryHeaders == AllHeaderMask {
  112. // pd.parseBody()
  113. // emails = append(emails, pd.email)
  114. // }
  115. // pd.reset()
  116. // continue
  117. // }
  118. // }
  119. // if pd.activeBoundary != "" {
  120. pd.bodyData += currentText + "\n"
  121. capture := utils.RegExpUtilsInstance().BoundaryEndFinder.FindStringSubmatch(currentText)
  122. if len(capture) == 2 && pd.activeBoundary == capture[1] {
  123. pd.state = StateBodyScan
  124. pd.activeBoundary = ""
  125. }
  126. // }
  127. }
  128. }
  129. if pd.state == StateBodyScan {
  130. if pd.mandatoryHeaders == AllHeaderMask {
  131. pd.parseBody()
  132. emails = append(emails, pd.email)
  133. }
  134. pd.reset()
  135. }
  136. return emails
  137. }
  138. func (pd *parseData) parseHeader(headerRaw string) {
  139. capture := utils.RegExpUtilsInstance().HeaderFinder.FindStringSubmatch(headerRaw)
  140. //Parse header
  141. if len(capture) == 3 {
  142. // fmt.Printf("capture Header %s : %s\n", strings.ToLower(capture[0]), strings.ToLower(capture[1]))
  143. header := strings.ToLower(capture[1])
  144. pd.mandatoryHeaders |= AtLeastOneHeaderMask
  145. switch header {
  146. case "from":
  147. pd.previousHeader = &pd.email.Header.From
  148. pd.mandatoryHeaders |= FromHeaderMask
  149. case "to":
  150. pd.previousHeader = &pd.email.Header.To
  151. pd.mandatoryHeaders |= ToHeaderMask
  152. case "x-original-to":
  153. if pd.email.Header.To == "" {
  154. pd.previousHeader = &pd.email.Header.To
  155. pd.mandatoryHeaders |= ToHeaderMask
  156. }
  157. case "cc":
  158. pd.previousHeader = &pd.email.Header.Cc
  159. case "bcc":
  160. pd.previousHeader = &pd.email.Header.Bcc
  161. pd.mandatoryHeaders |= ToHeaderMask
  162. case "subject":
  163. pd.previousHeader = &pd.email.Header.Subject
  164. case "date":
  165. pd.previousHeader = nil
  166. unixTime, err := parseDate(strings.Trim(capture[2], " \t"))
  167. if err == nil {
  168. pd.email.Header.Date = unixTime
  169. pd.mandatoryHeaders |= DateHeaderMask
  170. } else {
  171. log.Printf("Unable to parse message: %s\n", err)
  172. }
  173. case "content-type":
  174. pd.previousHeader = &pd.bodyContentType
  175. default:
  176. pd.previousHeader = nil
  177. }
  178. if pd.previousHeader != nil {
  179. *pd.previousHeader = strings.Trim(capture[2], " \t")
  180. }
  181. return
  182. }
  183. //Parse folding
  184. capture = utils.RegExpUtilsInstance().FoldingFinder.FindStringSubmatch(headerRaw)
  185. if len(capture) == 2 && pd.previousHeader != nil {
  186. *pd.previousHeader += capture[1]
  187. }
  188. }
  189. func (pd *parseData) parseBody() {
  190. buffer := bytes.NewBufferString("content-type:" + pd.bodyContentType + "\n\n" + pd.bodyData)
  191. en, err := enmime.ReadEnvelope(buffer)
  192. if err != nil {
  193. log.Printf("Unable to read mail body %s\n\nBody content: %s\n\n", err, pd.bodyData)
  194. return
  195. }
  196. pd.email.Body = &common.MailBody{}
  197. pd.email.Body.PlainText = en.Text
  198. pd.email.Body.RichText = en.HTML
  199. for _, attachment := range en.Attachments {
  200. uuid := uuid.New()
  201. fileName := hex.EncodeToString(uuid[:])
  202. attachmentFile, err := os.Create(config.ConfigInstance().AttachmentsPath + "/" + fileName)
  203. log.Printf("Attachment found %s\n", fileName)
  204. if err != nil {
  205. log.Printf("Unable to save attachment %s %s\n", fileName, err)
  206. continue
  207. }
  208. pd.email.Body.Attachments = append(pd.email.Body.Attachments, &common.AttachmentHeader{
  209. Id: fileName,
  210. FileName: attachment.FileName,
  211. ContentType: attachment.ContentType,
  212. })
  213. attachmentFile.Write(attachment.Content)
  214. }
  215. }
  216. func parseDate(stringDate string) (int64, error) {
  217. formatsToTest := []string{
  218. "Mon, _2 Jan 2006 15:04:05 -0700",
  219. time.RFC1123Z,
  220. time.RFC1123,
  221. time.UnixDate,
  222. "Mon, _2 Jan 2006 15:04:05 -0700 (MST)",
  223. "Mon, _2 Jan 2006 15:04:05 -0700 (MST)"}
  224. var err error
  225. for _, format := range formatsToTest {
  226. dateTime, err := time.Parse(format, stringDate)
  227. if err == nil {
  228. return dateTime.Unix(), nil
  229. }
  230. }
  231. return 0, errors.New("Invalid date format " + stringDate + " , " + err.Error())
  232. }