当前位置: 首页 > 工具软件 > crawler-java > 使用案例 >

Shell crawler

彭鸿文
2023-12-01
#!/usr/bin/bash

###############################################################################
# name:Robot
# date:2012-11-09
# desc:download porn pictures from baixingsex
###############################################################################

# exit codes used by Robot; read-only so nothing can overwrite them by accident
readonly FIRSTUSE=64            # called without any argument
readonly USER_CHOOSE_QUIT=65    # user answered n/no at the confirmation
readonly UNKNOWN_CHOICE=66      # user answered something other than y/yes/n/no
readonly BAD_ARGUMENTS=67       # argument validation failed


###############################################################################
# print the command-line synopsis and the numeric type table to stdout
# FIXME: accept the literal type names instead of the numeric codes
###############################################################################
function Usage {
    # one array element per output line; printf repeats the format for each
    local -a lines=(
        "Usage: ./Robot.sh [-o outdir] [-t type] [-s startpage] [-e endpage]"
        "      -t 42 zipai"
        "         43 siwa"
        "         44 toupai"
        "         45 yazhou"
        "         46 oumei"
        "         47 linglei"
        "         48 mingxing"
    )
    printf '%s\n' "${lines[@]}"
}


###############################################################################
# download one url into a local file unless that file is already present
#   $1 - url to download
#   $2 - destination path
# The data is written to "<dest>.part" and renamed only after wget succeeds,
# so a failed download is never mistaken for a cached file on the next run.
# A non-empty .part file is kept so that "wget -c" can pick it up again.
# returns 0 if the destination exists afterwards, non-zero otherwise
###############################################################################
function _RobotFetch {
    local url=$1
    local dest=$2

    [ -e "$dest" ] && return 0

    # up to 5 tries, 30s wait, continue a partial file
    if wget -t 5 -w 30 -c "$url" -O "$dest.part" 1>/dev/null 2>&1; then
        mv -f -- "$dest.part" "$dest"
    else
        # wget -O creates the file even on failure; drop it when it is empty
        [ -s "$dest.part" ] || rm -f -- "$dest.part"
        return 1
    fi
}


###############################################################################
# loop for each img url in each page and download it
#   $1 - output directory (created if missing; becomes the working directory)
#   $2 - forum type number
#   $3 - first page to process
#   $4 - last page to process (inclusive)
# Forum pages and thread pages are kept in "<outdir>/cache"; pictures go to
# "<outdir>/<thread title>/<index>.<ext>".
# returns non-zero if the output directory cannot be created or entered
###############################################################################
function RobotImpl {
    local dir=$1
    local type=$2
    local first=$3
    local last=$4
    # cache wget files
    local cache="cache"
    local site="http://news.baisex.me/"
    local page page_url page_html
    local thread_url title_name thread_html
    local pic_url pic_idx ext

    # change working-directory
    mkdir -p -- "$dir/$cache" || return 1
    cd -- "$dir" || return 1

    for ((page = first; page <= last; page++)); do
        page_url="${site}forum-${type}-${page}.html"
        page_html="${type}-${page}.html"
        printf '%s\t%s\n' "$page_url" "$page_html"

        # check first if file in cache
        if ! _RobotFetch "$page_url" "$cache/$page_html"; then
            printf 'failed to download %s\n' "$page_url" >&2
            continue
        fi

        # now we are in the output directory
        # The awk program below emits "<thread url>\t<thread title>" for every
        # line containing "atarget" inside a <tbody id="normalthread...> block.
        while IFS=$'\t' read -r thread_url title_name; do
            if [ -z "$thread_url" ] || [ -z "$title_name" ]; then
                continue
            fi
            printf '%s\t%s\n' "$thread_url" "$title_name"

            # a slash inside the title would be taken as a path separator
            title_name=${title_name//\//_}
            mkdir -p -- "$title_name" || continue

            # download each thread content without img
            thread_html="$cache/$thread_url"
            if ! _RobotFetch "${site}${thread_url}" "$thread_html"; then
                printf 'failed to download %s\n' "${site}${thread_url}" >&2
                continue
            fi

            # extract each pic's url from the thread page: the first
            # double-quoted string on every line that mentions onload
            awk '{if($0~/onload/){split($0,a,"\"");print a[2];}}' \
                "$thread_html" > pic.tmp

            pic_idx=1
            while IFS= read -r pic_url; do
                if [ -n "$pic_url" ]; then
                    # extension = text after the last dot of the url
                    ext=${pic_url##*.}
                    _RobotFetch "$pic_url" "$title_name/$pic_idx.$ext"
                fi
                pic_idx=$((pic_idx + 1))
            done < pic.tmp
        done < <(awk '
            function emit(line,    parts, rest, stop, title) {
                # url: first double-quoted string of the line
                split(line, parts, "\"")
                # title: text between the first ">" and the following "<"
                rest = substr(line, index(line, ">") + 1)
                stop = index(rest, "<")
                # start at 1: a start of 0 is handled differently by different
                # awk implementations and can drop the last character
                title = substr(rest, 1, stop - 1)
                gsub(/^[ \t]+|[ \t]+$/, "", title)
                printf("%s\t%s\n", parts[2], title)
            }
            /<tbody id="normalthread/ {
                # stop at end of input instead of looping forever when the
                # closing </tbody> is missing
                if ((getline line) <= 0) next
                while (line !~ /<\/tbody>/) {
                    if ((getline line) <= 0) break
                    if (line ~ /atarget/) emit(line)
                }
                next
            }
        ' "$cache/$page_html")
    done
}


###############################################################################
# check argument and call RobotImpl
#   -o outdir     output directory   (default /mnt/windows/baixingsex)
#   -t type       forum type, 42..48 (default 42)
#   -s startpage  first page         (default 1)
#   -e endpage    last page          (default 10)
# Anything else on the command line is ignored.
# exits with FIRSTUSE when called without arguments, BAD_ARGUMENTS when an
# option has no value or a value is invalid, USER_CHOOSE_QUIT / UNKNOWN_CHOICE
# depending on the answer to the confirmation prompt
###############################################################################
function Robot {
    # set arguments default value
    local dir=/mnt/windows/baixingsex
    local type=42
    local start=1
    local end=10
    # keep the command line as given; parsing below consumes "$@"
    local saved_args="$*"
    local value y_or_n
    local table=( zipai siwa toupai yazhou oumei linglei mingxing )

    if [ $# -eq 0 ]; then
        Usage
        exit "$FIRSTUSE"
    fi

    # parse command-line arguments
    while [ $# -gt 0 ]; do
        case $1 in
            -o|-t|-s|-e)
                # every option takes exactly one value
                if [ $# -lt 2 ]; then
                    Usage
                    exit "$BAD_ARGUMENTS"
                fi
                case $1 in
                    -o) dir=$2 ;;      # outdir
                    -t) type=$2 ;;     # type
                    -s) start=$2 ;;    # start page
                    -e) end=$2 ;;      # end page
                esac
                shift 2
                ;;
            *)                         # default: ignore
                shift
                ;;
        esac
    done

    # check arguments: numbers must be plain decimal digits
    for value in "$type" "$start" "$end"; do
        case $value in
            ''|*[!0-9]*)
                Usage
                exit "$BAD_ARGUMENTS"
                ;;
        esac
    done
    # force base 10 so that a value such as 08 is not read as octal
    type=$((10#$type))
    start=$((10#$start))
    end=$((10#$end))
    if [ -z "$dir" ] || [ "$type" -lt 42 ] || [ "$type" -gt 48 ] \
        || [ "$start" -gt "$end" ]; then
        Usage
        exit "$BAD_ARGUMENTS"
    fi

    # save arg to Robot.arg
    # in case of crash we could restart it
    # (only valid command lines are saved; printf, unlike echo, does not
    # swallow a leading -e or -n)
    printf '%s\n' "$saved_args" > Robot.arg

    printf '%s\n' "dir=$dir type=${table[type - 42]} start=$start end=$end"
    printf '%s' "Are you sure?[y/n]"
    read -r y_or_n
    case $y_or_n in
        y|yes)
            RobotImpl "$dir" "$type" "$start" "$end"
            ;;
        n|no)
            exit "$USER_CHOOSE_QUIT"
            ;;
        *)
            exit "$UNKNOWN_CHOICE"
            ;;
    esac
}


###############################################################################
# main entry
# "$@" hands every command-line argument to Robot as its own word, so values
# containing spaces are not split again
###############################################################################
Robot "$@"




#!/usr/bin/bash

###############################################################################
# name:Daemon.sh
# date:2012-11-09
# desc:run in background and monitor Robot process restart it when dead
###############################################################################

# script to watch, log file to append to, and file holding the saved arguments;
# all three are relative to the directory the daemon is started from
ROBOT="Robot.sh"
LOG="Robot.log"
ARG="Robot.arg"

###############################################################################
# check every 30 seconds whether a process matching $ROBOT is running and
# append "<timestamp> [alive]" or "<timestamp> [dead]" / "[restart]" to $LOG
# Globals: ROBOT (read), LOG (read), ARG (read)
# never returns; stop it by killing the process
###############################################################################
function Daemon {
    local now
    local -a robot_args

    while true; do
        now=$(date "+%Y-%m-%d %H:%M:%S")

        # pgrep never lists itself and its exit status tells whether anything
        # matched, so several matching PIDs are not a problem
        if pgrep -f "$ROBOT" > /dev/null 2>&1; then
            # too verbose ?
            printf '%s [alive]\n' "$now" >> "$LOG"
        else
            # Robot is dead log time first
            printf '%s [dead]\n' "$now" >> "$LOG"

            # reload arg from Robot.arg restart it
            # (split on whitespace into an array; no glob expansion)
            robot_args=()
            if [ -r "$ARG" ]; then
                read -r -a robot_args < "$ARG"
            fi

            # Robot asks "Are you sure?[y/n]" on stdin; answer it here because
            # nobody is at the keyboard for an automatic restart
            nohup "./$ROBOT" "${robot_args[@]}" <<< "y" &

            now=$(date "+%Y-%m-%d %H:%M:%S")
            printf '%s [restart]\n' "$now" >> "$LOG"
        fi

        # relax CPU
        sleep 30
    done
}


###############################################################################
# main entry
# start the monitoring loop; Daemon loops forever, so nothing placed after
# this call is reached
###############################################################################
Daemon




#!/usr/bin/bash

###############################################################################
# name:Shutdown.sh
# date:2012-11-10
# desc:shutdown Robot.sh and Daemon.sh
###############################################################################

# command-line patterns used to find the two processes to stop
ROBOT="Robot.sh"
DAEMON="Daemon.sh"

###############################################################################
# send TERM to every process matching $DAEMON and $ROBOT, wait a moment and
# report whether they are gone
# Globals: ROBOT (read), DAEMON (read)
# exits 0 when no matching process is left, 1 otherwise
###############################################################################
function shutdown {
    local waittime=2
    local download_log=/mnt/windows/baixingsex/Robot.download
    local titlenum=0
    local -a robot_pids daemon_pids

    echo -e "shutdown..."
    # one PID per array element, so several matches are handled correctly
    mapfile -t robot_pids < <(pgrep -f "$ROBOT")
    mapfile -t daemon_pids < <(pgrep -f "$DAEMON")

    # the daemon goes first so that it cannot restart Robot in between
    if [ "${#daemon_pids[@]}" -gt 0 ]; then
        kill "${daemon_pids[@]}"
    fi

    if [ "${#robot_pids[@]}" -gt 0 ]; then
        kill "${robot_pids[@]}"
    fi

    sleep "$waittime"

    mapfile -t robot_pids < <(pgrep -f "$ROBOT")
    mapfile -t daemon_pids < <(pgrep -f "$DAEMON")

    if [ "${#daemon_pids[@]}" -eq 0 ] && [ "${#robot_pids[@]}" -eq 0 ]; then
        # count the distinct values of the second tab-separated column;
        # NOTE(review): nothing in the visible scripts writes Robot.download,
        # so a missing file is reported as 0 threads
        if [ -r "$download_log" ]; then
            titlenum=$(awk -F'\t' '!seen[$2]++ { n++ } END { print n + 0 }' \
                "$download_log")
        fi
        echo -e "Robot has been shutdown"
        echo -e "total $titlenum threads downloaded"
        exit 0
    fi

    echo -e "Failed to shutdown Robot"
    exit 1
}


###############################################################################
# main entry
# stop Daemon.sh and Robot.sh; shutdown calls exit itself, so nothing placed
# after this call is reached
###############################################################################
shutdown

转载于:https://www.cnblogs.com/Anney/archive/2012/11/10/2763550.html

 类似资料:

相关阅读

相关文章

相关问答