Using a Shell Script to Automatically Download Photos from the MyPhoto Board on NewSmth.net

I wrote a script that automatically downloads pictures from the MyPhoto board of the SMTH BBS (NewSmth.net) to the local machine.
When run, it downloads all pictures posted within the last N days (default 3, configurable via the first argument) into a local photo directory.
Usage:
1. Save the code below as autoPicSmth.sh.
2. Make the script executable, then run it:
CHEYO:~/auto # chmod +x autoPicSmth.sh
CHEYO:~/auto # ./autoPicSmth.sh
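To download a different time window, pass the number of days as the first argument, e.g. for the last 7 days:
CHEYO:~/auto # ./autoPicSmth.sh 7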
The script is rather rough, and improvements are welcome; a small connectivity-check sketch is included after the script below.
Source code:
#!/bin/bash

#####################################################################
#       Script:  autoPicSmth.sh
#       Author:  cheyo
#        Email:  icheyo at Gmail dot com
#         From:  www.icheyo.net
#         Date:  2008-02-22
#
#  Description:
#      This script automatically downloads pictures from the MyPhoto
#      board on newsmth.net.
#
#####################################################################

# Usage: autoPicSmth.sh [days] 
# days:  download all pictures of recent /days/ days
# For Example:   ./autoPicSmth.sh 3


WORKING_DIR=working
PIC_OUT_DIR=photo
DAYS_TO_DOWN=3
QUERY_FILE=QueryResult.tmp
THREAD_FILE=ThreadUrl.tmp
FORMAT_FILE=ThreadInfo.tmp
CURR_THREAD_FILE=CurrThread.tmp
PIC_URL_FILE=PicUrl.tmp
PIC_DOWN_LOG=PicDown.log
PIC_INFO_FILE1=PicInfo1.tmp
PIC_INFO_FILE2=PicInfo2.tmp
PIC_INFO_FILE3=PicInfoFinal.tmp

# ------------------------------------------------------------------ #
# ShowUsage()
# Show the usage of this script
# ------------------------------------------------------------------ #

ShowUsage()
{
    echo "This script automatically downloads pictures from the MyPhoto board on newsmth.net"
    echo "  Usage: autoPicSmth.sh [days]"
    echo "  days:  download all pictures of the most recent /days/ days (default: 3)"
    echo "  Example:   ./autoPicSmth.sh 3"
}

# check arguments
if [ $# -gt 1 ]; then
    ShowUsage
    exit 1
elif [ $# -eq 1 ]; then
    DAYS_TO_DOWN=$1
fi

# Create the working directory and move into it
mkdir -p $WORKING_DIR
cd $WORKING_DIR

# Get the thread search result HTML page to local
SearchUrl="http://bbs4.newsmth.net/bbsbfind.php?q=1&board=MyPhoto&dt=${DAYS_TO_DOWN}&ag=1"
curl "${SearchUrl}" -o ${QUERY_FILE}

# Create a file to store all Thread URLs
egrep '<a href="bbscon\.php\?bid=' $QUERY_FILE | awk -F'[<>"]' '{print "http://bbs4.newsmth.net/" $9}' > $THREAD_FILE
ThreadCount=`cat $THREAD_FILE | wc -l`
echo "Found ${ThreadCount} threads in total."

# Create a file to store all BoardId and ThreadId
awk -F[=&] '{print $2,$4}' $THREAD_FILE > $FORMAT_FILE 

# Create a file to store all picture information
# Format: BoardId ArticleId FileName FileSize FileId
echo "# BoardId ArticleId FileName FileSize FileId" > $PIC_INFO_FILE1

cat $FORMAT_FILE | while read BoardId ArticleId
do
    ThreadUrl="http://bbs4.newsmth.net/bbscon.php?bid=$BoardId&id=$ArticleId"
    curl "$ThreadUrl" -o $CURR_THREAD_FILE
    # Each attachment appears in the page as a call like attach('FileName', FileSize, FileId);
    # strip the ");" and split on quotes/commas/spaces to pull out the fields
    grep "attach" $CURR_THREAD_FILE | tr -d ');' | awk -F"[' ,)]" -v BoardId=$BoardId -v ArticleId=$ArticleId '{print BoardId, ArticleId, $2, $5, $7}' >> $PIC_INFO_FILE1
done

# Create a file with picture info carrying only the file extension
# instead of the full file name.
# Format:  BoardId ArticleId FileExt FileSize FileId
# echo "# BoardId ArticleId FileExt FileSize FileId" > $PIC_INFO_FILE2
awk -F'[. ]' '$0 ~ /^[^#]/ {print $1,$2,$4,$5,$6}' $PIC_INFO_FILE1 >> $PIC_INFO_FILE2
# Remove records that don't contain enough info;
# a normal record has 5 columns.
awk 'NF == 5' $PIC_INFO_FILE2 > $PIC_INFO_FILE3

# Create a file to store all picture URLs
grep '^[^#]' $PIC_INFO_FILE3 | while read BoardId ArticleId FileExt FileSize FileId
do
    # Attachments larger than 50 KB use the "p" prefix in the URL; smaller ones use "s"
    if [ $FileSize -gt 51200 ]; then
        FileType="p"
    else
        FileType="s"
    fi

    PicUrl="http://att.newsmth.net/att.php?$FileType.$BoardId.$ArticleId.$FileId.$FileExt"
    echo "$PicUrl" >> $PIC_URL_FILE
done

# Remove all duplicated URLs from the file
mv ${PIC_URL_FILE} ${PIC_URL_FILE}.tmp
sort -dfu ${PIC_URL_FILE}.tmp > ${PIC_URL_FILE}
rm ${PIC_URL_FILE}.tmp

# Skip URLs that were already downloaded on a previous run
if [ -f "../${PIC_OUT_DIR}/${PIC_DOWN_LOG}" ]; then
    cp ../$PIC_OUT_DIR/${PIC_DOWN_LOG} .
    # The third field of each log line is the URL
    awk '{print $3}' ${PIC_DOWN_LOG} > ${PIC_URL_FILE}.history
    sort -dfu ${PIC_URL_FILE}.history > ${PIC_URL_FILE}.tmp
    mv ${PIC_URL_FILE}.tmp ${PIC_URL_FILE}.history
    # comm -1 -3 keeps only lines unique to the second file,
    # i.e. URLs not found in the download history
    comm -1 -3 ${PIC_URL_FILE}.history ${PIC_URL_FILE} > ${PIC_URL_FILE}.tmp
    mv ${PIC_URL_FILE}.tmp ${PIC_URL_FILE}
    rm ${PIC_URL_FILE}.history
fi

# Download all pictures from the server
PicCount=`wc -l $PIC_URL_FILE | awk '{print $1}'`
PicIndex=1
mkdir -p ../$PIC_OUT_DIR
echo "Total number of pictures to be downloaded: $PicCount"

cat $PIC_URL_FILE | while read CurrUrl
do
    # The part after the "?" in the URL is used as the local file name
    FileName=`echo "$CurrUrl" | awk -F'?' '{print $2}'`
    echo "[$PicIndex/$PicCount] Start to download $CurrUrl"
    curl "$CurrUrl" -o ../$PIC_OUT_DIR/$FileName

    # Write a download record to the log file
    CurrTime=`date +"%Y-%m-%d %H:%M:%S"`
    echo "$CurrTime $CurrUrl" >> "../$PIC_OUT_DIR/$PIC_DOWN_LOG"
    echo "[$PicIndex/$PicCount] Download finished."
    echo ""
    PicIndex=`expr $PicIndex + 1`
done

#mv $PIC_URL_FILE ../$PIC_OUT_DIR/PicUrl.list
#mv $PIC_INFO_FILE3 ../$PIC_OUT_DIR/PicInfo.list

cd ..
rm -rf $WORKING_DIR

echo "All picture downloads finished."