1# pm-jaorig.rc -- Extract embedded original message (simple recipe)
2#
3#   File id
4#
5#       Copyright (C) 1997-2010 Jari Aalto
6#
7#       This program is free software; you can redistribute it and/or
8#       modify it under the terms of the GNU General Public License as
9#       published by the Free Software Foundation; either version 2 of the
10#       License, or (at your option) any later version
11#
12#       This program is distributed in the hope that it will be useful, but
13#       WITHOUT ANY WARRANTY; without even the implied warranty of
14#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15#       General Public License for more details at
16#       <http://www.gnu.org/copyleft/gpl.html>.
17#
18#   Documentation
19#
20#       This subroutine digs embedded message from the body and replaces
21#       current message with it. Copy the message to folder before calling
22#       this subroutine if you need original.
23#
24#       NOTE: This is _simple_ tool and the sole purpose is to derive
25#       simple embedded messages. Write full fledged perl script if you
26#       want better extracting features. The used AWK inside this procmail
27#       recipe will fail to find 30% of the cases, mostly due to non-standard
28#       way of including the message. The recognized formats are as follows.
29#       Anything that differs from these are ignored or incorrectly parsed.
30#
31#       o   Message is embedded left flushed "as is". With full headers or
32#           Minimum of `From:' `Subject' `Received'
33#       o   The embedded message is quoted with `>' with optional _one_
34#           space.
35#
36#   Where you would use this module
37#
38#       If you're subscribed to mailing lists that regularly sent copies of
39#       original message to the list, like forwarding spam to SPAM-L
40#       mailing list at http://bounce.to/dmuth, then you'd like to
41#       extract the original embedded message which you can then feed to
42#       your UBE filter to test if the shield holds.
43#
44#           <spam-l-request@peach.ease.lsoft.com>
45#           subscribe SPAM-L <First name> <Last name>
46#
47#       This recipe takes simplistic approach and tries it's best to
48#       extract embedded message. Idea for this recipe comes from Era
49#       Eriksson's posting "recipe to turn list postings back into original
50#       spam" 1998-06-25 in Procmail mailing list.
51#
52#       o   Body must contain headers
53#       o   Remove all `>' quotations.
54#       o   extract everything to the end of message. (There are no means
55#           to get rid of the attached signatures that ot forwarding poster
56#           or list server may have attached.
57#
58#   How the message is extracted
59#
60#       When this recipe ends, the current message has been modified so that
61#       it _is_ _the_ _original_ _message_. Like if you would receive:
62#
63#           HEADER-1        # The poster
64#           body-1          # his comments
65#           HEADER-2        # The original embedded message
66#           body-2
67#           body-1          # And poster's signature or mailing list footer
68#
69#       The message now looks like
70#
71#           HEADER-2
72#           body-2
73#           body-1
74#
75#       And you can save this as original message or feed it to your
76#       UBE filter and test if it detects it.
77#
78#   Code note: procmail or awk core dump
79#
80#       For some reason procmail kept dumping core I write the code in
81#       more nicer format like below, but if I made it compact, then it
82#       didn't dump core. Go figure. I'm not pleased that I had to sacrafice
83#       clarity, but there was no other way.
84#
85#           [The good style]            [The forced compact style]
86#           if ()                       if () { statement }
87#           {
88#               statement
89#           }
90#
91#       I have no explanation why this
92#       happens, the same AWK code would work just fine most of the cases and
93#       then came this message `x' and caused dumping the code, if I feed
94#       some other message, I didn't get core dump. Total mystery to me.
95#       Don't let the log message fool you, this had nothing to do
96#       regexp  "^[> ]*From:.*[a-zA-Z]". If I deletd one line from AWK
97#       script, it worked ok, if I added it back the core dump happened with
98#       that message `x'
99#
100#           procmail: Assigning "pfx=[> ]*"
101#           procmail: No match on "^[> ]*From:.*[a-zA-Z]"
102#           Segmentation fault (core dumped)
103#
104#   Required settings
105#
106#       PMSRC must point to source directory of procmail includerc code.
107#       This subroutine needs module(s):
108#
109#       o   pm-javar.rc
110#
111#   Call arguments (variables to set before calling)
112#
113#       (none)
114#
115#   Usage Example
116#
117#       Let's assume that you want to feed all forwarded UBE that is
118#       posted to spam-l mailing list to your filter and see if it needs
119#       improving by checking the logs later. The forwarded UBE to the list
120#       is labelled "SPAM:" in the subject line.
121#
122#           $RC_LIST = $PMSRC/pm-jalist.rc   # mailing list detector
123#           $RC_ORIG = $PMSRC/pm-jaorig.rc   # extract original
124#           $RC_UBE  = $PMSRC/pm-jaube.rc    # UBE filter
125#
126#           ...
127#
128#           INCLUDERC = $RC_LIST            # defines variable `LIST'
129#
130#           :0
131#           * ! LIST ^^^^
132#           {
133#               :0                          # spam-l mailing list
134#               * LIST ?? spam
135#               * Subject: +SPAM:
136#               {
137#                   INCLUDERC = $RC_ORIG    # Change it to UBE message
138#
139#                   #   Ok, next feed it to filter, set some variables first
140#                   #   Log = Short log; What filters were applied to message
141#                   #   mbx = If message was trapped, save it here
142#
143#                   JA_UBE_LOG      = "$PMSRC/pm-ube.log"
144#                   JA_UBE_MBOX     = "junk.ube.ok.mbox"
145#
146#                   INCLUDERC = $RC_UBE
147#
148#                   :0 :            # If comes here, filter failed
149#                   junk.ube.nok.mbx
150#               }
151#
152#               :0 :                    # save normal list messages
153#               list.$LIST
154#           }
155#
156#
157#
158#   Change Log (none)
159
160
161# ............................................................ &init ...
162
163
164id    = "pm-jaorig.rc"
165dummy = "
166========================================================================
167$id: init:
168"
169
170:0
171* !  WSPC ?? ( )
172{
173    INCLUDERC = $PMSRC/pm-javar.rc
174}
175
176# .......................................................... &detect ...
177
178pfx = "[> ]*"
179
180:0
181*$  B ?? ^${pfx}From:.*$a
182*$  B ?? ^${pfx}Subject:.*$a
183*$  B ?? ^${pfx}Received: +from
184{
185
186    #   Remove all original headers from the message,
187    #   AWK will shift up the headers in the body
188
189    :0 fhw
190    | echo ""
191
192
193    :0              # make sure LINEBUF has value or procmail dumps core
194    *$ ! LINEBUF ?? $d
195    {
196        LINEBUF = 8192
197    }
198
199    #   Set bigger LINEBUF. Otherwise procmail/awk may dump core when
200    #   a big message is handled.
201
202    savedLinebuf = $LINEBUF
203    LINEBUF      = 524280
204
205    savedShell   = $SHELLMETAS
206    SHELLMETAS
207
208    #   How it works:
209    #
210    #   *)  Because AWK can't do multi line matches, we have to use two flags:
211    #       `prevHead' is set to 1 as soon as header line is detected.
212    #       _BUT_ in order to start header, it must follow with next
213    #       header immediately. The `head' is set to 1 only if previous
214    #       line was header.
215    #
216    #       --> when `head' is raised, then main starts printing the message
217    #
218    #   *)  do case insensitive match and filter out certain lines.
219    #
220    #   *)  `Received:' Header is immediate indication of message.
221    #
222    #   *)  Remove `>' quotation
223    #
224    #   *)  Print embedded message
225    #
226    #       variable `end' is set as soon as there is end of headers.
227    #       we fix continued non-header start lines by preceding them
228    #       with two spaces. (Forwarded message to spam-l was all left
229    #       flushed, thus breaking the proper RFC header continuation)
230    #       Var `end' tells when we must not fix lines any more.
231    #
232
233    :0 fhbw
234    | $AWK                                                                  \
235      '                                                                     \
236        /^[>]* ?([A-Z][-a-zA-Z]+:|^From )/                                  \
237        {  if( prevHead && !head) {print prev; head=1}  }                   \
238                                                                            \
239        {                                                                   \
240            str = tolower($0);                                              \
241            if ( match(str, "forwarded message") > 0)          { next }     \
242            if ( match(str, "received: +from|return-path:")>0) { head = 1 } \
243                                                                            \
244            if ( head == 1 )                                                \
245            {                                                               \
246                sub("^[>]","");                                             \
247                if ( match($0,"^[ ]*$") )           { end = 1 }             \
248                if ( !end )                         { sub("^ +","")}        \
249                if ( !end && match($0,"[A-Z][-a-zA-Z]+:|^From ") < 1)       \
250                {  $0 = "  " $0; }                                          \
251                print;                                                      \
252            }                                                               \
253                                                                            \
254            prevHead = 0;                                                   \
255            if ( match($0, "^[>]* ?([A-Z][-a-zA-Z]+:|^From )") > 0 )        \
256            { prevHead = 1; prev = $0; }                                    \
257        }                                                                   \
258      '
259
260    LINEBUF     = $savedLinebuf
261    SHELLMETAS  = $savedShell
262
263    #   Add a From_ line if it doesn't have one.
264
265    :0 fhw
266    *$ ! ^^From$s+
267    | $FORMAIL
268
269}
270
271dummy = "$id: end:"
272
273# End of file pm-jaorig.rc
274