> From: http://www.EMT-EHV.BE.PHILIPS.COM/~Peter.vanMeurs
> Date: Fri Apr 2, 7:17am
>
> Hello ,
>
> Monitoring changes on a complete site is still difficult (although pages I can do)
>
> The printing of a complete website is still unresolved.
>
> Peter
>
> http://dummy.us.eu.org/robert on 99/04/02 03:51:37 AM
> To: Peter van Meurs/EHV/EMT/BE/PHILIPS@PHILIPS07
>
> Hi.  Did you get this resolved?  I may be able to help if not.

You might wanna look at
http://dummy.us.eu.org/robert/software/procmail/mime.shtml for converting
html (via html-to-ascii).

Also, do you know about wget?  That's what I use to grab web pages.  See
http://www.lns.cornell.edu/public/COMP/info/wget/wget_toc.html.  If you
don't have it, try a web search for it.

Below, I'll include the script I use for showing me diffs on a web page.
When I call the script by "url_change_mon_rec", it does a recursive
retrieval.

Unfortunately, I don't have a lot of time to help you more.  Perhaps you
can adapt this script.

#!/bin/csh -f
#
# Fetch a URL (or copy a local file), convert it to plain text and report
# how that text differs from the copy kept from the previous run.
#
# The behaviour is selected by the name the script is invoked under:
#
#   file_change_mon FILE URL       compare local FILE; the saved copy is
#                                  keyed on URL/FILE
#   file_change_full_mon FILE URL  same, but also print the whole text
#                                  when nothing changed
#   url_change_mon_rec URL [wget options]
#                                  recursive wget below URL (-np); each
#                                  fetched file is passed to file_change_mon
#   url_change_mon_fullrec URL [wget options]
#                                  same as above without -np
#   url_change_mon_post URL?DATA   POST the part after the ? to the part
#                                  before it, using lwp-request
#   url_change_mon_lynx URL        fetch with lynx -source
#   url_change_full_mon URL        wget; also print the whole text when
#                                  nothing changed
#   url_change_mon_bos URL         wget; strip a leading number-dot-space
#                                  prefix from every text line
#   any other name                 wget; print a diff when the text changed
#
# Environment:
#   TRASHDIR  scratch directory for temporary files (must be exported,
#             the POST branch reads it from inside sh -c)
#   HOST      host name, used to build temporary file names
#
# Saved copies live in RHOME/folders/procmailed/urls/ where RHOME is two
# directory levels above this script.
#
# External helpers used: wget, lynx, lwp-request, dos2unix, html-to-ascii,
# pstotext, ridblkln and guz.  NOTE(review): guz and ridblkln are not
# defined here; presumably a gunzip wrapper and a blank-line remover.
#
# Exit status: 0 on success or no change, 1 on usage or naming errors,
# otherwise the status of the failed retrieval command.

if ($#argv < 1) then
  exit 1
endif

# rhome is the grandparent directory of the script; name is its basename.
set this_dir=$0
set this_dir=$this_dir:h
set rhome=$this_dir:h
set name=$0
set name=$name:t

if ($name == file_change_mon || $name == file_change_full_mon) then
  # File modes need both FILE and URL.  Without this guard the shift below
  # leaves argv empty and csh aborts with a subscript error after the
  # compressed copy has already been touched.
  if ($#argv < 2) then
    exit 1
  endif
  set file="$argv[1]:q"
  # Make sure an uncompressed FILE exists and no stale .gz is left behind.
  if (-f "$file:q".gz) then
    if (-f "$file:q") then
      rm "$file:q".gz
    else
      guz -fq "$file:q"
    endif
  endif
  shift
  set url="$argv[1]:q/$file"
  #set url1="`echo '$url:q'|sed -e 's,\([^/]/\)[^/].*,\1,'`"
else
  set url="$argv[1]:q"
endif

# Build a file name from the URL: lower-case it and replace the characters
# / ? & * $ by the letters S Q M A D.
echo "$url:q" \
  |tr '[:upper:]' '[:lower:]' \
  |sed -e 's,/,S,g;s,?,Q,g;s,&,M,g;s,\*,A,g;s,\$,D,g' \
  > $TRASHDIR/url_change.1.tmp
if (! -s $TRASHDIR/url_change.1.tmp) then
  echo $argv
  echo Cannot form file name.
  exit 1
endif

# fn is the path of the saved copy, cut to 250 characters.
# ext is the text after the last dot of the mangled name, cut to 4 chars.
set fn=$rhome/folders/procmailed/urls/"`cat $TRASHDIR/url_change.1.tmp`"
set fn="`expr substr $fn:q 1 250`"
set ext="`sed -e 's,.*\.,,;s,\(....\).*,\1,' $TRASHDIR/url_change.1.tmp`"
#if ($fn:e != htm && $fn:e != html) then
#  set fn=${fn}.html
#endif

# Same .gz handling as above, this time for the saved copy.
if (-f $fn.gz) then
  if (-f $fn) then
    rm $fn.gz
  else
    guz -qf $fn
  endif
endif

# On one particular host only: give up early when the DNS lookup reports
# that the host part of the URL does not exist.
if ($HOST == fuck \
    && $name != file_change_mon && $name != file_change_full_mon) then
  # check host existence
  set hostn=`echo "$url:q"|sed -e 's,^[^/]*//,,;s,[:/].*$,,'`
  nslookup -retry=60 -timeout=60 $hostn |& fgrep Non-existent
  if ($status == 0) then
    date
    echo "$url:q" failed\!
    exit 1
  endif
endif

# Retrieval.  Every branch except the recursive one leaves the document in
# TRASHDIR/url_change.PID.HOST.1 and diagnostics in the same name without
# the .1 suffix.
if ($name == file_change_mon || $name == file_change_full_mon) then
  cp "$file:q" $TRASHDIR/url_change.$$.$HOST.1
  @ retstat=$status
else if ($name == url_change_mon_rec || $name == url_change_mon_fullrec) then
  set np="-np"
  if ($name == url_change_mon_fullrec) then
    set np=""
  endif
  # On the first run (no mirror directory yet) the fetched names are only
  # listed; on later runs each one is handed to file_change_mon.
  set cmd="xargs -l1 file_change_mon"
  if (! -d $TRASHDIR/$fn:t.dir) then
    mkdir $TRASHDIR/$fn:t.dir
    set cmd="cat"
  endif
  pushd $TRASHDIR/$fn:t.dir > /dev/null
  # The sed turns each wget log line naming a saved file into a quoted
  # pair of file name and URL, the argument form file_change_mon expects.
  wget -N -T 240 -t 3 -nv -r -R gif,GIF,jpg,JPG \
    $np -nH -P . $argv[2-]:q \
    "$url:q" \
    |& tee $TRASHDIR/url_change.$$.$HOST.1 \
    | grep ' -> "' \
    | sed -e 's,^.* -> ",,;s,".*$,,;s,^\./,,;s,^,'"'"',;s, *$,'"'"' '"'$url:q'"',;s,%3A,:,g' \
    | $cmd
  @ retstat=$status
  #cat $TRASHDIR/url_change.$$.$HOST.1
  @ numf=`ls -1|wc -l`
  popd > /dev/null
  #head -100000 $TRASHDIR/url_change.$$.$HOST.dir/{*.htm*,*.HTM*} > $TRASHDIR/url_change.$$.$HOST.1
  #/bin/rm -r $TRASHDIR/url_change.$$.$HOST.dir
  # since the above recursion takes place, there's no need to do anything
  # else
  # Status 123 is not treated as a failure here; xargs returns it when one
  # of the commands it ran exited non-zero.
  if ($retstat != 0 && $retstat != 123) then
    if ($numf == 0) then
      rmdir $TRASHDIR/$fn:t.dir
    endif
    date
    echo "$url:q" failed\! '('$retstat')'
    cat $TRASHDIR/url_change.$$.$HOST.1
  endif
  /bin/rm $TRASHDIR/url_change.$$.$HOST.1
  exit $retstat
else if ($name == url_change_mon_post) then
  #set echo
  #set verbose
  # The scratch name file is reused to hold the URL without its query.
  echo "$url:q" | sed -e 's,?.*,,' > $TRASHDIR/url_change.1.tmp
  # The response body must go to the .1 file because the code after this
  # if-chain reads it from there; previously only stderr was redirected
  # and the body was lost on stdout.
  echo "$url:q" | sed -e 's,.*?,,' \
    | sh -c 'lwp-request -m POST "`cat $TRASHDIR/url_change.1.tmp`" >$TRASHDIR/url_change.'$$.$HOST'.1 2>$TRASHDIR/url_change.'$$.$HOST
  @ retstat=$status
  #unset echo
  #unset verbose
else if ($name == url_change_mon_lynx) then
  sh -c "/usr/bin/lynx -source '$url:q' > $TRASHDIR/url_change.$$.$HOST.1 2> $TRASHDIR/url_change.$$.$HOST"
  @ retstat=$status
else
  wget -T 240 -t 3 -nv -O $TRASHDIR/url_change.$$.$HOST.1 "$url:q" >& $TRASHDIR/url_change.$$.$HOST
  @ retstat=$status
endif

if ($retstat != 0) then
  date
  echo "$url:q" failed\! '('$retstat')'
  cat $TRASHDIR/url_change.$$.$HOST
  /bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST
  exit $retstat
endif

# Stage the new copy under the basename of fn.  PDF is kept byte for byte;
# anything else gets DOS line endings removed and control characters made
# visible.
if ("$ext" == pdf) then
  mv $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/$fn:t
else
  dos2unix < $TRASHDIR/url_change.$$.$HOST.1 | cat -v > $TRASHDIR/$fn:t
endif
/bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST

# Quick exit when the raw document is byte-identical to the saved copy
# (skipped in the full modes, which always report).
if ($name != url_change_full_mon && $name != file_change_full_mon) then
  if (-f $fn) then
    if ( { cmp -s $TRASHDIR/$fn:t $fn } ) then
      /bin/rm $TRASHDIR/$fn:t
      exit 0
    endif
  endif
endif

# cmd is the last filter applied to the text rendering.
set cmd=cat
if ($name == url_change_mon_bos) then
  set cmd=(sed -e 's,^[0-9][0-9]*\. ,,')
endif

# Render the new copy as text.
if ("$ext" == ps || "$ext" == pdf) then
  if (-d /usr/local/share/ghostscript) then
    guz -r /usr/local/share/ghostscript
  endif
  pstotext $TRASHDIR/$fn:t | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
else
  sh -c "html-to-ascii $TRASHDIR/$fn:t 2>/dev/null" | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
endif

if (-f $fn) then
  # Render the saved copy the same way and compare the two renderings.
  if ("$ext" == ps || "$ext" == pdf) then
    pstotext $fn | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.old.txt
  else
    sh -c "html-to-ascii $fn 2>/dev/null" | ridblkln | uniq | $cmd:q \
      > $TRASHDIR/$fn:t.old.txt
  endif
  if ( ! { cmp -s $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt } ) then
    # Text changed: print a unified diff with the lines that exist only in
    # the old copy (and the +++ header) removed, then save the new copy.
    date
    echo "$url:q"
    diff -wu2 $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt \
      | sed -e '/^+/d'
    /bin/rm $fn
    #mv $fn $fn:h/url_change_mon.$fn:t
    #rm $fn:h/url_change_mon.$fn:t
    if ("$fn:t" == "") then
      echo Found bug\!
      echo $fn
      echo "$url:q"
    else
      mv $TRASHDIR/$fn:t $fn
    endif
  else
    # Text unchanged: only the full modes report, printing everything.
    if ($name == url_change_full_mon \
        || $name == file_change_full_mon) then
      date
      echo "$url:q"
      cat $TRASHDIR/$fn:t.txt
      /bin/rm $fn
      #mv $fn $fn:h/url_change_mon.$fn:t
      #rm $fn:h/url_change_mon.$fn:t
      if ("$fn:t" == "") then
        echo Found bug\!
        echo $fn
        echo "$url:q"
      else
        mv $TRASHDIR/$fn:t $fn
      endif
    endif
  endif
  /bin/rm $TRASHDIR/$fn:t.old.txt
else
  # No saved copy yet: print the whole text and save the new copy.
  date
  echo "$url:q"
  cat $TRASHDIR/$fn:t.txt
  if ("$fn:t" == "") then
    echo Found bug\!
    echo $fn
    echo "$url:q"
  else
    mv $TRASHDIR/$fn:t $fn
  endif
endif

/bin/rm $TRASHDIR/url_change.1.tmp
/bin/rm $TRASHDIR/$fn:t.txt
exit 0