[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: How to print a complete website, How to monitor changes
- To: http://www.EMT-EHV.BE.PHILIPS.COM/~Peter.vanMeurs
- Subject: Re: How to print a complete website, How to monitor changes
- From: http://dummy.us.eu.org/robert
- Date: Sat, 3 Apr 1999 14:21:01 -0500
- In-Reply-To: <0016800012601426000002L062*@MHS>
- Keywords: http://www.EMT-EHV.BE.PHILIPS.COM/~Peter.vanMeurs
> From: http://www.EMT-EHV.BE.PHILIPS.COM/~Peter.vanMeurs
> Date: Fri Apr 2, 7:17am
>
> Hello ,
>
> Monitoring changes on a complete site is still difficult (although pages
> I can do)
>
> The printing of a complete website is still unresolved.
>
> Peter
>
> http://dummy.us.eu.org/robert on 99/04/02 03:51:37 AM
> To: Peter van Meurs/EHV/EMT/BE/PHILIPS@PHILIPS07
>
> Hi. Did you get this resolved? I may be able to help if not.
You might wanna look at
http://dummy.us.eu.org/robert/software/procmail/mime.shtml for
converting html (via html-to-ascii).
Also, do you know about wget? That's what I use to grab web pages.
See http://www.lns.cornell.edu/public/COMP/info/wget/wget_toc.html. If
you don't have it, try a web search for it.
Below, I'll include the script I use for showing me diffs on a web page.
When I invoke the script under the name "url_change_mon_rec", it does a
recursive retrieval.
Unfortunately, I don't have a lot of time to help you more. Perhaps you
can adapt this script.
#!/bin/csh -f
# Reports changes to a web page or a local file since the previous run.
# The name the script is invoked under selects the mode. Names tested in
# this script: file_change_mon, file_change_full_mon, url_change_mon_rec,
# url_change_mon_fullrec, url_change_mon_post, url_change_mon_lynx,
# url_change_mon_bos and url_change_full_mon. Any other name takes the
# plain wget path.
# Reads the TRASHDIR and HOST variables to build scratch file names.
# Exits with status 1 when called without arguments.
if ($#argv < 1) then
exit 1
endif
# this_dir is the head (directory part) of the invocation path.
set this_dir=$0
set this_dir=$this_dir:h
# rhome is one level above this_dir; the stored page copies are kept in
# rhome/folders/procmailed/urls.
set rhome=$this_dir:h
# name is the tail (basename) of the invocation path and picks the mode.
set name=$0
set name=$name:t
# Work out the url that identifies the monitored item.
# File modes: argument 1 is a local file and argument 2 is a prefix that
# is joined to the file name with a slash. All other modes: argument 1
# is the url itself.
if ($name == file_change_mon || $name == file_change_full_mon) then
set file="$argv[1]:q"
# A .gz sibling of the file is removed when the plain file also exists,
# otherwise guz is run on the file name.
# NOTE(review): guz is not defined in this script; presumably a gunzip
# wrapper that restores the plain file - confirm.
if (-f "$file:q".gz) then
if (-f "$file:q") then
rm "$file:q".gz
else
guz -fq "$file:q"
endif
endif
# Drop the file argument so the prefix becomes argument 1.
shift
set url="$argv[1]:q/$file"
#set url1="`echo '$url:q'|sed -e 's,\([^/]/\)[^/].*,\1,'`"
else
set url="$argv[1]:q"
endif
# Turn the url into a key usable as a file name: lowercase it, then map
# slash, question mark, ampersand, asterisk and dollar to the letters
# S, Q, M, A and D. The key is written to a scratch file.
# NOTE(review): the scratch name url_change.1.tmp is fixed, so two runs
# sharing TRASHDIR at the same time would overwrite each other - confirm
# whether concurrent runs are expected.
echo "$url:q" \
|tr '[:upper:]' '[:lower:]' \
|sed -e 's,/,S,g;s,?,Q,g;s,&,M,g;s,\*,A,g;s,\$,D,g' \
> $TRASHDIR/url_change.1.tmp
# An empty key file means no name could be formed: show the arguments
# and exit with status 1.
if (! -s $TRASHDIR/url_change.1.tmp) then
echo $argv
echo Cannot form file name.
exit 1
endif
# fn is the path of the stored copy, cut to its first 250 characters.
set fn=$rhome/folders/procmailed/urls/"`cat $TRASHDIR/url_change.1.tmp`"
set fn="`expr substr $fn:q 1 250`"
# ext is what follows the last dot of the key (the whole key when it has
# no dot), cut to at most 4 characters. It is compared with ps and pdf
# later in the script.
set ext="`sed -e 's,.*\.,,;s,\(....\).*,\1,' $TRASHDIR/url_change.1.tmp`"
#if ($fn:e != htm && $fn:e != html) then
# set fn=${fn}.html
#endif
# Deal with a compressed stored copy, if there is one: when only the .gz
# form exists guz is run on the plain name, and when the plain copy is
# also present the .gz is discarded.
if (-f $fn.gz) then
  if (! -f $fn) then
    guz -qf $fn
  else
    rm $fn.gz
  endif
endif
# Optional DNS sanity check. It runs only when HOST equals the literal
# name compared below and the script is not in one of the file modes.
if ($HOST == fuck \
&& $name != file_change_mon && $name != file_change_full_mon) then
# check host existence
# hostn is the url with everything up to the double slash removed and
# everything from the first colon or slash onwards removed.
set hostn=`echo "$url:q"|sed -e 's,^[^/]*//,,;s,[:/].*$,,'`
# fgrep prints any matching line and its exit status is read right
# after the pipeline; status 0 means the text Non-existent was found.
nslookup -retry=60 -timeout=60 $hostn |& fgrep Non-existent
if ($status == 0) then
date
echo "$url:q" failed\!
exit 1
endif
endif
# Fetch the current version of the monitored item. Every branch that
# falls through to the code below leaves the content in
# TRASHDIR/url_change.PID.HOST.1, any diagnostics in
# TRASHDIR/url_change.PID.HOST, and the fetch status in retstat.
# The recursive branch is the exception: it hands each retrieved file to
# file_change_mon and then exits on its own.
if ($name == file_change_mon || $name == file_change_full_mon) then
  # File modes: the content is simply a copy of the local file.
  cp "$file:q" $TRASHDIR/url_change.$$.$HOST.1
  @ retstat=$status
else if ($name == url_change_mon_rec || $name == url_change_mon_fullrec) then
  # Recursive modes: mirror the site with wget into a per-url directory.
  # The fullrec variant drops the -np option.
  set np="-np"
  if ($name == url_change_mon_fullrec) then
    set np=""
  endif
  # Each retrieved file is normally passed to file_change_mon. On the
  # first run (mirror directory absent) the list is only printed.
  set cmd="xargs -l1 file_change_mon"
  if (! -d $TRASHDIR/$fn:t.dir) then
    mkdir $TRASHDIR/$fn:t.dir
    set cmd="cat"
  endif
  pushd $TRASHDIR/$fn:t.dir > /dev/null
  # The wget log is kept via tee. From each log line containing an arrow
  # the saved file name is extracted, a leading ./ is stripped, and the
  # name plus the url are emitted as two single-quoted words; %3A is
  # turned back into a colon.
  wget -N -T 240 -t 3 -nv -r -R gif,GIF,jpg,JPG \
    $np -nH -P . $argv[2-]:q \
    "$url:q" \
    |& tee $TRASHDIR/url_change.$$.$HOST.1 \
    | grep ' -> "' \
    | sed -e 's,^.* -> ",,;s,".*$,,;s,^\./,,;s,^,'"'"',;s, *$,'"'"' '"'$url:q'"',;s,%3A,:,g' \
    | $cmd
  # Status of the pipeline must be read before any other command runs.
  @ retstat=$status
  #cat $TRASHDIR/url_change.$$.$HOST.1
  @ numf=`ls -1|wc -l`
  popd > /dev/null
  #head -100000 $TRASHDIR/url_change.$$.$HOST.dir/{*.htm*,*.HTM*} > $TRASHDIR/url_change.$$.$HOST.1
  #/bin/rm -r $TRASHDIR/url_change.$$.$HOST.dir
  # since the above recursion takes place, there's no need to do anything
  # else
  # Status 123 is not treated as a failure here.
  # NOTE(review): with GNU xargs 123 means some invocation of the command
  # exited with status 1-125 - confirm this is the intended reading.
  if ($retstat != 0 && $retstat != 123) then
    # Remove the mirror directory again if nothing was retrieved.
    if ($numf == 0) then
      rmdir $TRASHDIR/$fn:t.dir
    endif
    date
    echo "$url:q" failed\! '('$retstat')'
    cat $TRASHDIR/url_change.$$.$HOST.1
  endif
  /bin/rm $TRASHDIR/url_change.$$.$HOST.1
  exit $retstat
else if ($name == url_change_mon_post) then
  # POST mode: the part of the url before the question mark is the
  # target and the part after it is sent as the request body on stdin.
  #set echo
  #set verbose
  echo "$url:q" | sed -e 's,?.*,,' > $TRASHDIR/url_change.1.tmp
  # Fix: the response body is now written to the .1 scratch file, as in
  # the lynx and wget branches. Previously it went to standard output,
  # so the later steps found no content file to read.
  echo "$url:q" | sed -e 's,.*?,,' \
    | sh -c 'lwp-request -m POST "`cat $TRASHDIR/url_change.1.tmp`" >$TRASHDIR/url_change.'$$.$HOST'.1 2>$TRASHDIR/url_change.'$$.$HOST
  @ retstat=$status
  #unset echo
  #unset verbose
else if ($name == url_change_mon_lynx) then
  # Lynx mode: fetch the page source with lynx.
  # NOTE(review): the url is placed inside single quotes of an sh
  # command line, so a url containing a single quote would break it.
  sh -c "/usr/bin/lynx -source '$url:q' > $TRASHDIR/url_change.$$.$HOST.1 2> $TRASHDIR/url_change.$$.$HOST"
  @ retstat=$status
else
  # Default: single page fetch with wget, log kept separately.
  wget -T 240 -t 3 -nv -O $TRASHDIR/url_change.$$.$HOST.1 "$url:q" >& $TRASHDIR/url_change.$$.$HOST
  @ retstat=$status
endif
# Fetch failed: print the date, the url with the status, and the saved
# diagnostics, then remove both scratch files and exit with that status.
if ($retstat != 0) then
date
echo "$url:q" failed\! '('$retstat')'
cat $TRASHDIR/url_change.$$.$HOST
/bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST
exit $retstat
endif
# Place the new copy at TRASHDIR/<tail of fn>. A pdf is moved as is;
# anything else goes through dos2unix and cat -v first.
if ("$ext" == pdf) then
mv $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/$fn:t
else
dos2unix < $TRASHDIR/url_change.$$.$HOST.1 | cat -v > $TRASHDIR/$fn:t
endif
# The per-process scratch files are no longer needed.
/bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST
# Outside the two full modes: when a stored copy exists and cmp finds
# the new copy identical to it, delete the new copy and exit 0 without
# printing anything.
# NOTE(review): this exit leaves TRASHDIR/url_change.1.tmp behind; the
# next run overwrites it - confirm that is acceptable.
if ($name != url_change_full_mon && $name != file_change_full_mon) then
if (-f $fn) then
if ( { cmp -s $TRASHDIR/$fn:t $fn } ) then
/bin/rm $TRASHDIR/$fn:t
exit 0
endif
endif
endif
# Render the new copy as text in TRASHDIR/<tail of fn>.txt.
# cmd is the last filter of the pipeline: cat by default, or in the bos
# mode a sed that strips a leading number followed by a dot and a space.
set cmd=cat
if ($name == url_change_mon_bos) then
set cmd=(sed -e 's,^[0-9][0-9]*\. ,,')
endif
# ps and pdf content goes through pstotext, everything else through
# html-to-ascii with its error output discarded. Both results are then
# piped through ridblkln, uniq and cmd.
# NOTE(review): ridblkln, html-to-ascii and guz are external helpers not
# shown here; ridblkln presumably removes blank lines and guz -r
# presumably uncompresses the ghostscript directory - confirm.
if ("$ext" == ps || "$ext" == pdf) then
if (-d /usr/local/share/ghostscript) then
guz -r /usr/local/share/ghostscript
endif
pstotext $TRASHDIR/$fn:t | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
else
sh -c "html-to-ascii $TRASHDIR/$fn:t 2>/dev/null" | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
endif
# Compare the text of the new copy with the text of the stored copy,
# print a report, and install the new copy as the stored one.
if (-f $fn) then
# A stored copy exists: render it to text the same way as the new copy.
if ("$ext" == ps || "$ext" == pdf) then
pstotext $fn | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.old.txt
else
sh -c "html-to-ascii $fn 2>/dev/null" | ridblkln | uniq | $cmd:q \
> $TRASHDIR/$fn:t.old.txt
endif
if ( ! { cmp -s $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt } ) then
# The texts differ: print the date, the url and a unified diff taken
# from new to old with every line starting with a plus removed, so text
# found only in the new copy shows up on lines marked with a minus.
date
echo "$url:q"
diff -wu2 $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt \
| sed -e '/^+/d'
# Replace the stored copy with the new one.
/bin/rm $fn
#mv $fn $fn:h/url_change_mon.$fn:t
#rm $fn:h/url_change_mon.$fn:t
# Guard against an empty tail of fn before moving the new copy in.
if ("$fn:t" == "") then
echo Found bug\!
echo $fn
echo "$url:q"
else
mv $TRASHDIR/$fn:t $fn
endif
else
# The texts are equal. Only the two full modes report anyway: they print
# the date, the url and the whole new text, then replace the stored copy.
# NOTE(review): in the other modes the new copy stays in TRASHDIR on
# this path and the stored copy is not refreshed - confirm intent.
if ($name == url_change_full_mon \
|| $name == file_change_full_mon) then
date
echo "$url:q"
cat $TRASHDIR/$fn:t.txt
/bin/rm $fn
#mv $fn $fn:h/url_change_mon.$fn:t
#rm $fn:h/url_change_mon.$fn:t
if ("$fn:t" == "") then
echo Found bug\!
echo $fn
echo "$url:q"
else
mv $TRASHDIR/$fn:t $fn
endif
endif
endif
/bin/rm $TRASHDIR/$fn:t.old.txt
else
# No stored copy yet: print the date, the url and the whole new text,
# then store the new copy.
date
echo "$url:q"
cat $TRASHDIR/$fn:t.txt
if ("$fn:t" == "") then
echo Found bug\!
echo $fn
echo "$url:q"
else
mv $TRASHDIR/$fn:t $fn
endif
endif
# Remove the key scratch file and the text rendering, then finish.
/bin/rm $TRASHDIR/url_change.1.tmp
/bin/rm $TRASHDIR/$fn:t.txt
exit 0