# Import the libraries
import datetime
import re

import pandas as pd
import plotly.express as px

print( "Reading the apache log" )

# Read the logfile.
# The log is space-separated with quoted fields (request line, referrer,
# user agent); escapechar handles escaped quotes inside those fields.
# header=None gives the columns integer labels 0..N-1, which the rest of
# the script indexes directly.
# NOTE(review): assumes the Apache "combined" log format — confirm against
# the server's LogFormat directive.
df = pd.read_csv( "access.log", header=None, sep=" ", quotechar="\"", escapechar="\\" )
df.head()  # NOTE(review): return value is discarded — notebook leftover, no effect in a script

print( "Cleaning the log" )

# Drop unused columns.
# NOTE(review): column meanings assumed from the usual Apache combined
# layout (identity, userid, timezone part, response size) — confirm.
df.drop( [1,2,4,8], axis="columns", inplace=True )

# Remove all errors: keep only rows whose numeric status (column 6)
# is below 400, i.e. successes and redirects.
df = df[df[6] < 400]

# Filter our own requests (column 0 is the client IP)
df = df[df[0]!="81.6.49.243"]

# Filter IPs with suspicious access patterns
suspicious = [
  "82.80.249.137",
  "82.80.249.159",
  "82.80.249.249",
  "146.4.22.190",
  "212.227.250.21",
  "95.217.74.38"
]

# Vectorized membership test — equivalent to the former
# df[0].apply(lambda ip: ip not in suspicious) but done in one pass
# inside pandas instead of one Python call per row.
df = df[~df[0].isin( suspicious )]

# IP is no longer useful, drop it
df.drop( [0], axis="columns", inplace=True )

# Preparation for the bot filter: user-agent matching is case-insensitive,
# so lower-case column 9 (the user-agent string) once up front.
df[9] = df[9].str.lower()
df.head()

# Filter the bots: any user agent containing one of these substrings
# (already lower-cased to match the column above) is dropped.
bots = [
  "adsbot",
  "adscanner",
  "ahrefsbot",
  "alphabot",
  "alphaseobot",
  "applebot",
  "aspiegelbot",
  "bingbot",
  "blexbot",
  "borneobot",
  "bot@linkfluence",
  "bot@tracemyfile",
  "brands-bot",
  "ccbot",
  "clarabot",
  "cliqzbot",
  "coccocbot",
  "discordbot",
  "dnsresearchbot",
  "domainstatsbot",
  "dotbot",
  "duckduckbot",
  "exabot",
  "facebot",
  "frobots",
  "gigabot",
  "googlebot",
  "internet-structure-research-project-bot",
  "jobboersebot",
  "kazbtbot",
  "keybot",
  "linguee bot",
  "mauibot",
  "mj12bot",
  "msnbot",
  "nimbostratus-bot",
  "niuebot",
  "obot",
  "our-bot",
  "adbeat_bot",
  "petalbot",
  "pinterestbot",
  "pooplebot",
  "ru_bot",
  "scraperbot",
  "semrushbot",
  "seobilitybot",
  "seokicks",
  "serpstatbot",
  "seznambot",
  "sidetrade indexer bot",
  "smtbot",
  "statvoobot",
  "surdotlybot",
  "tigerbot",
  "tmmbot",
  "triplecheckerrobot",
  "twitterbot",
  "vebidoobot",
  "webtechbot",
  "wiederfreibot",
  "x28-job-bot",
  "yacybot",
  "yandexbot",
  "zoominfobot",
  "spider@seocompany.store",
  "barkrowler",
  "website-datenbank.de",
  "crawler_eb_germany",
  "searchatlas.com",
  "adstxtcrawler",
  "backlinkcrawler",
  "cipacrawler",
  "domaincrawler",
  "grapeshotcrawler",
  "mbcrawler",
  "webcrawler",
  "crawler4j",
  "sslyze",
  "localsearch",
  "winhttprequest",
  "webdatastats" ]

# Build ONE alternation regex from all bot substrings and drop matching
# rows in a single pass, instead of re-scanning the frame once per bot.
# re.escape is the bug fix: the old per-bot str.contains(bot) treated
# each entry as a regex, so "." in e.g. "searchatlas.com" matched ANY
# character and could over-match unrelated user agents.
# na=True counts rows with a missing user agent as bots, preserving the
# old behaviour where NaN rows were silently dropped (NaN == False).
bot_pattern = "|".join( re.escape( bot ) for bot in bots )
df = df[~df[9].str.contains( bot_pattern, na=True )]

# Keep just two columns (3 = timestamp, 6 = status): drop request,
# protocol-ish remainder and the now-used user agent.
df.drop( [5,7,9], axis="columns", inplace=True )

print( "Group by days" )

# Convert the string to a date. The raw field looks like
# "[01/Jan/2024:00:00:00", so characters 1..11 are the day/month/year
# part. pd.to_datetime with an explicit format is the vectorized
# equivalent of the former per-row strptime apply() — same slice
# (d[1:12]), same "%d/%b/%Y" format, one C-level pass.
df[3] = pd.to_datetime( df[3].str.slice( 1, 12 ), format="%d/%b/%Y" )

# Group and count the requests per day (only the status column remains,
# so count() yields one count column).
grp = df.groupby( [3] ).count().reset_index()

print( "Add computed columns" )

# Rename the columns
grp.columns = ["date","count"]

# Add a column for the weekday
grp["weekday"] = grp["date"].apply( lambda d: d.strftime("%a") )

# Add a column for the weekend
grp["weekend"] = grp["date"].apply( lambda d: "no" if d.weekday() < 5 else "yes" )

# Convert the date back to a string!
grp["date"] = grp["date"].apply( lambda d: d.strftime("%d.%m.%y") )

print( "Generate the plot" )

# Colour weekends red and weekdays blue
weekend_colors = { "yes":"red", "no":"blue" }

# Bar chart of requests per day; category_orders pins the x axis to the
# chronological order the rows already have (the dates are now plain
# strings, so plotly would otherwise sort them lexicographically).
fig = px.bar(
    grp,
    x="date",
    y="count",
    color="weekend",
    color_discrete_map=weekend_colors,
    hover_data=["date","count","weekday"],
    category_orders={ "date": grp["date"].tolist() },
)
fig.update_yaxes( title="", visible=True, showticklabels=True )
fig.update_layout( showlegend=False )

# Write plot to file and show it
fig.write_html( "tmp.html", auto_open=True )