parser.go 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. /*
  2. * MIT License
  3. *
  4. * Copyright (c) 2020 Alexey Edelev <semlanik@gmail.com>
  5. *
  6. * This file is part of gostfix project https://git.semlanik.org/semlanik/gostfix
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy of this
  9. * software and associated documentation files (the "Software"), to deal in the Software
  10. * without restriction, including without limitation the rights to use, copy, modify,
  11. * merge, publish, distribute, sublicense, and/or sell copies of the Software, and
  12. * to permit persons to whom the Software is furnished to do so, subject to the following
  13. * conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all copies
  16. * or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  19. * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  20. * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  21. * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  22. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. package scanner
  26. import (
  27. "bufio"
  28. "bytes"
  29. "encoding/hex"
  30. "errors"
  31. "log"
  32. "os"
  33. "strings"
  34. "time"
  35. "git.semlanik.org/semlanik/gostfix/common"
  36. "git.semlanik.org/semlanik/gostfix/config"
  37. utils "git.semlanik.org/semlanik/gostfix/utils"
  38. "github.com/google/uuid"
  39. enmime "github.com/jhillyerd/enmime"
  40. )
  41. const (
  42. StateHeaderScan = iota
  43. StateBodyScan
  44. )
  45. const (
  46. AtLeastOneHeaderMask = 1 << iota
  47. FromHeaderMask
  48. DateHeaderMask
  49. ToHeaderMask
  50. AllHeaderMask = 15
  51. )
  52. type parseData struct {
  53. state int
  54. mandatoryHeaders int
  55. previousHeader *string
  56. email *common.Mail
  57. bodyContentType string
  58. bodyData string
  59. activeBoundary string
  60. }
  61. func (pd *parseData) reset() {
  62. *pd = parseData{
  63. state: StateHeaderScan,
  64. previousHeader: nil,
  65. mandatoryHeaders: 0,
  66. email: NewEmail(),
  67. bodyContentType: "plain/text",
  68. bodyData: "",
  69. activeBoundary: "",
  70. }
  71. }
  72. func parseFile(file *utils.LockedFile) []*common.Mail {
  73. var emails []*common.Mail
  74. pd := &parseData{}
  75. pd.reset()
  76. scanner := bufio.NewScanner(file)
  77. for scanner.Scan() {
  78. switch pd.state {
  79. case StateHeaderScan:
  80. if scanner.Text() == "" {
  81. if pd.mandatoryHeaders&AtLeastOneHeaderMask == AtLeastOneHeaderMask { //Cause we read at least one header
  82. pd.previousHeader = nil
  83. boundaryCapture := utils.RegExpUtilsInstance().BoundaryFinder.FindStringSubmatch(pd.bodyContentType)
  84. if len(boundaryCapture) == 2 {
  85. pd.activeBoundary = boundaryCapture[1]
  86. } else {
  87. pd.activeBoundary = ""
  88. }
  89. pd.state = StateBodyScan
  90. }
  91. } else {
  92. pd.parseHeader(scanner.Text())
  93. }
  94. case StateBodyScan:
  95. if scanner.Text() == "" {
  96. if pd.state == StateBodyScan && pd.activeBoundary == "" {
  97. if pd.mandatoryHeaders == AllHeaderMask {
  98. pd.parseBody()
  99. emails = append(emails, pd.email)
  100. }
  101. pd.reset()
  102. continue
  103. }
  104. }
  105. if pd.activeBoundary != "" {
  106. pd.bodyData += scanner.Text() + "\n"
  107. capture := utils.RegExpUtilsInstance().BoundaryEndFinder.FindStringSubmatch(scanner.Text())
  108. if len(capture) == 2 && pd.activeBoundary == capture[1] {
  109. pd.state = StateBodyScan
  110. pd.activeBoundary = ""
  111. }
  112. }
  113. }
  114. }
  115. if pd.state == StateBodyScan {
  116. if pd.mandatoryHeaders == AllHeaderMask {
  117. pd.parseBody()
  118. emails = append(emails, pd.email)
  119. }
  120. pd.reset()
  121. }
  122. return emails
  123. }
  124. func (pd *parseData) parseHeader(headerRaw string) {
  125. capture := utils.RegExpUtilsInstance().HeaderFinder.FindStringSubmatch(headerRaw)
  126. //Parse header
  127. if len(capture) == 3 {
  128. // fmt.Printf("capture Header %s : %s\n", strings.ToLower(capture[0]), strings.ToLower(capture[1]))
  129. header := strings.ToLower(capture[1])
  130. pd.mandatoryHeaders |= AtLeastOneHeaderMask
  131. switch header {
  132. case "from":
  133. pd.previousHeader = &pd.email.Header.From
  134. pd.mandatoryHeaders |= FromHeaderMask
  135. case "to":
  136. pd.previousHeader = &pd.email.Header.To
  137. pd.mandatoryHeaders |= ToHeaderMask
  138. case "cc":
  139. pd.previousHeader = &pd.email.Header.Cc
  140. case "bcc":
  141. pd.previousHeader = &pd.email.Header.Bcc
  142. pd.mandatoryHeaders |= ToHeaderMask
  143. case "subject":
  144. pd.previousHeader = &pd.email.Header.Subject
  145. case "date":
  146. pd.previousHeader = nil
  147. unixTime, err := parseDate(strings.Trim(capture[2], " \t"))
  148. if err == nil {
  149. pd.email.Header.Date = unixTime
  150. pd.mandatoryHeaders |= DateHeaderMask
  151. } else {
  152. log.Printf("Unable to parse message: %s\n", err)
  153. }
  154. case "content-type":
  155. pd.previousHeader = &pd.bodyContentType
  156. default:
  157. pd.previousHeader = nil
  158. }
  159. if pd.previousHeader != nil {
  160. *pd.previousHeader = strings.Trim(capture[2], " \t")
  161. }
  162. return
  163. }
  164. //Parse folding
  165. capture = utils.RegExpUtilsInstance().FoldingFinder.FindStringSubmatch(headerRaw)
  166. if len(capture) == 2 && pd.previousHeader != nil {
  167. *pd.previousHeader += capture[1]
  168. }
  169. }
  170. func (pd *parseData) parseBody() {
  171. buffer := bytes.NewBufferString("content-type:" + pd.bodyContentType + "\n\n" + pd.bodyData)
  172. en, err := enmime.ReadEnvelope(buffer)
  173. if err != nil {
  174. log.Printf("Unable to read mail body %s\n\nBody content: %s\n\n", err, pd.bodyData)
  175. return
  176. }
  177. pd.email.Body = &common.MailBody{}
  178. pd.email.Body.PlainText = en.Text
  179. pd.email.Body.RichText = en.HTML
  180. for _, attachment := range en.Attachments {
  181. uuid := uuid.New()
  182. fileName := hex.EncodeToString(uuid[:])
  183. attachmentFile, err := os.Create(config.ConfigInstance().AttachmentsPath + "/" + fileName)
  184. log.Printf("Attachment found %s\n", fileName)
  185. if err != nil {
  186. log.Printf("Unable to save attachment %s %s\n", fileName, err)
  187. continue
  188. }
  189. pd.email.Body.Attachments = append(pd.email.Body.Attachments, &common.AttachmentHeader{
  190. Id: fileName,
  191. FileName: attachment.FileName,
  192. ContentType: attachment.ContentType,
  193. })
  194. attachmentFile.Write(attachment.Content)
  195. }
  196. }
  197. func parseDate(stringDate string) (int64, error) {
  198. formatsToTest := []string{"Mon, _2 Jan 2006 15:04:05 -0700", time.RFC1123Z, time.RFC1123, time.UnixDate}
  199. var err error
  200. for _, format := range formatsToTest {
  201. dateTime, err := time.Parse(format, stringDate)
  202. if err == nil {
  203. return dateTime.Unix(), nil
  204. }
  205. }
  206. return 0, errors.New("Invalid date format " + stringDate + " , " + err.Error())
  207. }