[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: How to print a complete website, How to monitor changes




 > From: http://www.EMT-EHV.BE.PHILIPS.COM/~Peter.vanMeurs
 > Date: Fri  Apr  2,  7:17am
 >
 > Hello ,
 > 
 > Monitoring changes on a complete site is still difficult (although
 > pages I can do)
 > 
 > The printing of a complete website is still unresolved.
 > 
 > Peter
 > 
 > http://dummy.us.eu.org/robert on 99/04/02 03:51:37 AM
 > To: Peter van Meurs/EHV/EMT/BE/PHILIPS@PHILIPS07
 > 
 > Hi.  Did you get this resolved?  I may be able to help if not.

You might wanna look at 
http://dummy.us.eu.org/robert/software/procmail/mime.shtml for
converting html (via html-to-ascii).

Also, do you know about wget?  That's what I use to grab web pages.  
See http://www.lns.cornell.edu/public/COMP/info/wget/wget_toc.html.  If
you don't have it, try a web search for it.

Below, I'll include the script I use for showing me diffs on a web page.
When I invoke the script under the name "url_change_mon_rec", it does a
recursive retrieval.

Unfortunately, I don't have a lot of time to help you more.  Perhaps you
can adapt this script.

#!/bin/csh -f
# Report changes in a web page or a local file since the previous run.
#
# A copy of the last seen content is kept under
# rhome/folders/procmailed/urls, where rhome is the parent of the
# directory that holds this script. The new content is converted to
# text and compared with the text of the stored copy.
#
# The behaviour is selected by the name the script is invoked under:
#   file_change_mon         argv[1] is a local file, argv[2] is the URL
#                           prefix it belongs to
#   file_change_full_mon    same, but see the full modes below
#   url_change_mon_rec      recursive wget with -np, every fetched file
#                           is handed to file_change_mon
#   url_change_mon_fullrec  same without -np
#   url_change_mon_post     the part of the URL after the question mark
#                           is sent as a POST body with lwp-request
#   url_change_mon_lynx     fetch with lynx -source
#   url_change_mon_bos      wget fetch, a leading number and dot is
#                           stripped from each text line
#   url_change_full_mon     wget fetch, full mode
#   any other name          wget fetch
# The full modes skip the byte comparison shortcut and print the whole
# text when the converted text did not change.
#
# Environment: TRASHDIR is used as scratch directory, HOST is used in
# scratch file names.
# guz, ridblkln and html-to-ascii are helpers that are not part of this
# file. NOTE(review): guz looks like a gunzip wrapper and ridblkln like
# a blank line remover. TODO confirm.
if ($#argv < 1) then
	exit 1
endif
# Directory of the script, its parent directory and the invocation name.
set this_dir=$0
set this_dir=$this_dir:h
set rhome=$this_dir:h
set name=$0
set name=$name:t
if ($name == file_change_mon || $name == file_change_full_mon) then
	# File mode. If a gzipped variant of the file exists, drop it when
	# the plain file is there too, otherwise run guz on the file.
	set file="$argv[1]:q"
	if (-f "$file:q".gz) then
		if (-f "$file:q") then
			rm "$file:q".gz
		else
			guz -fq "$file:q"
		endif
	endif
	shift
	# The URL is the prefix argument followed by the file name.
	set url="$argv[1]:q/$file"
	#set url1="`echo '$url:q'|sed -e 's,\([^/]/\)[^/].*,\1,'`"
else
	set url="$argv[1]:q"
endif
# Turn the lowercased URL into a name without special characters:
# slash to S, question mark to Q, ampersand to M, asterisk to A and
# dollar sign to D.
echo "$url:q" \
	|tr '[:upper:]' '[:lower:]' \
	|sed -e 's,/,S,g;s,?,Q,g;s,&,M,g;s,\*,A,g;s,\$,D,g' \
	> $TRASHDIR/url_change.1.tmp
if (! -s $TRASHDIR/url_change.1.tmp) then
	echo $argv
	echo Cannot form file name.
	exit 1
endif
# fn is the path of the stored copy, cut to at most 250 characters.
# ext is at most the first four characters after the last dot of the name.
set fn=$rhome/folders/procmailed/urls/"`cat $TRASHDIR/url_change.1.tmp`"
set fn="`expr substr $fn:q 1 250`"
set ext="`sed -e 's,.*\.,,;s,\(....\).*,\1,' $TRASHDIR/url_change.1.tmp`"
#if ($fn:e != htm && $fn:e != html) then
#	set fn=${fn}.html
#endif
# Same gzip handling as above, now for the stored copy.
if (-f $fn.gz) then
	if (-f $fn) then
		rm $fn.gz
	else
		guz -qf $fn
	endif
endif
# On one specific host and outside file mode, give up early when
# nslookup reports the host of the URL as Non-existent.
if ($HOST == fuck \
    && $name != file_change_mon && $name != file_change_full_mon) then
	# check host existence
	set hostn=`echo "$url:q"|sed -e 's,^[^/]*//,,;s,[:/].*$,,'`
	nslookup -retry=60 -timeout=60 $hostn |& fgrep Non-existent
	if ($status == 0) then
		date
		echo "$url:q" failed\!
		exit 1
	endif
endif
# Fetch the new content into the scratch file ending in .1 and remember
# the exit status of the fetch in retstat.
if ($name == file_change_mon || $name == file_change_full_mon) then
	cp "$file:q" $TRASHDIR/url_change.$$.$HOST.1
	@ retstat=$status
else if ($name == url_change_mon_rec || $name == url_change_mon_fullrec) then
	set np="-np"
	if ($name == url_change_mon_fullrec) then
		set np=""
	endif
	# Each fetched file is passed on to file_change_mon. When the
	# download directory did not exist yet, it is created and the list
	# of fetched files is only printed.
	set cmd="xargs -l1 file_change_mon"
	if (! -d $TRASHDIR/$fn:t.dir) then
		mkdir $TRASHDIR/$fn:t.dir
		set cmd="cat"
	endif
	pushd $TRASHDIR/$fn:t.dir > /dev/null
	# The sed step rewrites every wget log line that names a saved file
	# into the quoted file name followed by the quoted URL.
	wget -N -T 240 -t 3 -nv -r -R gif,GIF,jpg,JPG \
		$np -nH -P . $argv[2-]:q \
		"$url:q" \
		|& tee $TRASHDIR/url_change.$$.$HOST.1 \
		| grep ' -> "' \
		| sed -e 's,^.* -> ",,;s,".*$,,;s,^\./,,;s,^,'"'"',;s, *$,'"'"' '"'$url:q'"',;s,%3A,:,g' \
		| $cmd
	@ retstat=$status
	#cat $TRASHDIR/url_change.$$.$HOST.1
	@ numf=`ls -1|wc -l`
	popd > /dev/null
	#head -100000 $TRASHDIR/url_change.$$.$HOST.dir/{*.htm*,*.HTM*} > $TRASHDIR/url_change.$$.$HOST.1
	#/bin/rm -r $TRASHDIR/url_change.$$.$HOST.dir
	# since the above recursion takes place, there's no need to do anything
	# else
	# Status 123 is not treated as a failure. NOTE(review): GNU xargs
	# documents 123 for a child command that exited with 1 to 125.
	if ($retstat != 0 && $retstat != 123) then
		if ($numf == 0) then
			rmdir $TRASHDIR/$fn:t.dir
		endif
		date
		echo "$url:q" failed\! '('$retstat')'
		cat $TRASHDIR/url_change.$$.$HOST.1
	endif
	/bin/rm $TRASHDIR/url_change.$$.$HOST.1
	exit $retstat
else if ($name == url_change_mon_post) then
	#set echo
	#set verbose
	# The part before the question mark is the address, the part after
	# it is piped to lwp-request as the POST body.
	echo "$url:q" | sed -e 's,?.*,,' > $TRASHDIR/url_change.1.tmp
	# The response goes to the .1 scratch file like in the other fetch
	# modes, because the steps below read the new content from there.
	echo "$url:q" | sed -e 's,.*?,,' \
		| sh -c 'lwp-request -m POST "`cat $TRASHDIR/url_change.1.tmp`" >$TRASHDIR/url_change.'$$.$HOST'.1 2>$TRASHDIR/url_change.'$$.$HOST
	@ retstat=$status
	#unset echo
	#unset verbose
else if ($name == url_change_mon_lynx) then
	sh -c "/usr/bin/lynx -source '$url:q' > $TRASHDIR/url_change.$$.$HOST.1 2> $TRASHDIR/url_change.$$.$HOST"
	@ retstat=$status
else
	wget -T 240 -t 3 -nv -O $TRASHDIR/url_change.$$.$HOST.1 "$url:q" >& $TRASHDIR/url_change.$$.$HOST
	@ retstat=$status
endif
# Failed fetch: show the captured messages, clean up and pass the status on.
if ($retstat != 0) then
	date
	echo "$url:q" failed\! '('$retstat')'
	cat $TRASHDIR/url_change.$$.$HOST
	/bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST
	exit $retstat
endif
# The new copy is stored in TRASHDIR under the tail of fn. PDF content is
# moved unchanged, anything else goes through dos2unix and cat -v.
if ("$ext" == pdf) then
	mv $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/$fn:t
else
	dos2unix < $TRASHDIR/url_change.$$.$HOST.1 | cat -v > $TRASHDIR/$fn:t
endif
/bin/rm -f $TRASHDIR/url_change.$$.$HOST.1 $TRASHDIR/url_change.$$.$HOST
# Shortcut outside the full modes: nothing to report when the new copy
# is byte for byte the same as the stored one.
if ($name != url_change_full_mon && $name != file_change_full_mon) then
	if (-f $fn) then
		if ( { cmp -s $TRASHDIR/$fn:t $fn } ) then
			/bin/rm $TRASHDIR/$fn:t
			exit 0
		endif
	endif
endif
# cmd is the last filter of the text conversion. It is a plain cat
# unless the bos mode asks for the leading numbering to be stripped.
set cmd=cat
if ($name == url_change_mon_bos) then
	set cmd=(sed -e 's,^[0-9][0-9]*\. ,,')
endif
# Convert the new copy to text: pstotext for ps and pdf, html-to-ascii
# for everything else.
if ("$ext" == ps || "$ext" == pdf) then
	if (-d /usr/local/share/ghostscript) then
		guz -r /usr/local/share/ghostscript
	endif
	pstotext $TRASHDIR/$fn:t | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
else
	sh -c "html-to-ascii $TRASHDIR/$fn:t 2>/dev/null" | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.txt
endif
if (-f $fn) then
	# A stored copy exists: convert it the same way and compare the texts.
	if ("$ext" == ps || "$ext" == pdf) then
		pstotext $fn | ridblkln | uniq | $cmd:q > $TRASHDIR/$fn:t.old.txt
	else
		sh -c "html-to-ascii $fn 2>/dev/null" | ridblkln | uniq | $cmd:q \
			> $TRASHDIR/$fn:t.old.txt
	endif
	if ( ! { cmp -s $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt } ) then
		# Text changed. The diff runs from new to old, and the lines
		# that begin with a plus sign are dropped from the report.
		date
		echo "$url:q"
		diff -wu2 $TRASHDIR/$fn:t.txt $TRASHDIR/$fn:t.old.txt \
			| sed -e '/^+/d'
		/bin/rm $fn
		#mv $fn $fn:h/url_change_mon.$fn:t
		#rm $fn:h/url_change_mon.$fn:t
		# Guard against an empty tail before replacing the stored copy.
		if ("$fn:t" == "") then
			echo Found bug\!
			echo $fn
			echo "$url:q"
		else
			mv $TRASHDIR/$fn:t $fn
		endif
	else
		# Text unchanged: only the full modes print it and replace the
		# stored copy.
		if ($name == url_change_full_mon \
		    || $name == file_change_full_mon) then
			date
			echo "$url:q"
			cat $TRASHDIR/$fn:t.txt
			/bin/rm $fn
			#mv $fn $fn:h/url_change_mon.$fn:t
			#rm $fn:h/url_change_mon.$fn:t
			if ("$fn:t" == "") then
				echo Found bug\!
				echo $fn
				echo "$url:q"
			else
				mv $TRASHDIR/$fn:t $fn
			endif
		endif
	endif
	/bin/rm $TRASHDIR/$fn:t.old.txt
else
	# No stored copy yet: print the whole text and store the new copy.
	date
	echo "$url:q"
	cat $TRASHDIR/$fn:t.txt
	if ("$fn:t" == "") then
		echo Found bug\!
		echo $fn
		echo "$url:q"
	else
		mv $TRASHDIR/$fn:t $fn
	endif
endif
/bin/rm $TRASHDIR/url_change.1.tmp
/bin/rm $TRASHDIR/$fn:t.txt
exit 0





Why do you want this page removed?