Upload
This commit is contained in:
parent
c3e8f84e3c
commit
db703d6ed6
207
max/max-scraper.py
Normal file
207
max/max-scraper.py
Normal file
@ -0,0 +1,207 @@
|
||||
import os
|
||||
import time
|
||||
import random
|
||||
import string
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
SEEN_LINKS_FILE = "seen_links.txt"
|
||||
|
||||
GENRE_URLS = [
|
||||
"https://play.max.com/genre/action",
|
||||
"https://play.max.com/genre/comedy",
|
||||
"https://play.max.com/genre/drama",
|
||||
"https://play.max.com/genre/horror",
|
||||
"https://play.max.com/genre/sci-fi",
|
||||
"https://play.max.com/genre/documentary"
|
||||
]
|
||||
|
||||
def setup_browser():
|
||||
options = Options()
|
||||
options.add_argument("--disable-gpu")
|
||||
options.add_argument("--window-size=1920,1080")
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||
|
||||
def scroll_to_bottom(driver, max_wait=60):
|
||||
print("\U0001f4dc Scrolling to load all movies...")
|
||||
last_height = driver.execute_script("return document.body.scrollHeight")
|
||||
start_time = time.time()
|
||||
|
||||
while True:
|
||||
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||
time.sleep(2.5)
|
||||
new_height = driver.execute_script("return document.body.scrollHeight")
|
||||
if new_height == last_height:
|
||||
print("\u2705 Reached bottom of page.")
|
||||
break
|
||||
last_height = new_height
|
||||
if time.time() - start_time > max_wait:
|
||||
print("\u23f1\ufe0f Scroll timed out.")
|
||||
break
|
||||
|
||||
def extract_movie_links(driver):
|
||||
anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')
|
||||
links = list(set([a.get_attribute("href") for a in anchors if "/movie/" in a.get_attribute("href")]))
|
||||
return links
|
||||
|
||||
def generate_random_query():
|
||||
letters = string.ascii_lowercase
|
||||
return ''.join(random.choices(letters, k=random.choice([1, 2])))
|
||||
|
||||
def load_seen_links():
|
||||
if not os.path.exists(SEEN_LINKS_FILE):
|
||||
return set()
|
||||
with open(SEEN_LINKS_FILE, "r") as f:
|
||||
return set([line.strip() for line in f if line.strip()])
|
||||
|
||||
def save_seen_links(new_links):
|
||||
with open(SEEN_LINKS_FILE, "a") as f:
|
||||
for link in new_links:
|
||||
f.write(link + "\n")
|
||||
|
||||
def save_links(links, batch_id, prefix):
|
||||
output_dir = "output_movies"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
for i in range(0, len(links), 10):
|
||||
chunk = links[i:i+10]
|
||||
file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{(i // 10) + 1}.txt")
|
||||
with open(file_path, "w") as f:
|
||||
for link in chunk:
|
||||
f.write(link + "\n")
|
||||
print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")
|
||||
|
||||
def run_movies_mode(driver, seen_links, batch_counter):
|
||||
driver.get("https://play.max.com/movies")
|
||||
time.sleep(4)
|
||||
scroll_to_bottom(driver)
|
||||
all_links = extract_movie_links(driver)
|
||||
print(f"\u2705 Loaded {len(all_links)} movie links.")
|
||||
|
||||
while True:
|
||||
try:
|
||||
count = int(input("\U0001f522 How many random movies do you want to extract? "))
|
||||
break
|
||||
except ValueError:
|
||||
print("\u274c Invalid number.")
|
||||
|
||||
sample = random.sample(all_links, min(count, len(all_links)))
|
||||
unique_links = [link for link in sample if link not in seen_links]
|
||||
duplicates = len(sample) - len(unique_links)
|
||||
|
||||
if unique_links:
|
||||
save_links(unique_links, batch_counter, "movies")
|
||||
save_seen_links(unique_links)
|
||||
print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
|
||||
else:
|
||||
print("\u26a0\ufe0f All selected links were duplicates.")
|
||||
|
||||
def run_search_mode(driver, seen_links, search_counter):
|
||||
search_terms = [
|
||||
"the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
|
||||
"girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen"
|
||||
]
|
||||
max_attempts = 10
|
||||
found_total = 0
|
||||
|
||||
for attempt in range(max_attempts):
|
||||
query = random.choice(search_terms)
|
||||
url = f"https://play.max.com/search?q={query}"
|
||||
print(f"\n\U0001f50d Searching for query: '{query}'")
|
||||
driver.get(url)
|
||||
time.sleep(4)
|
||||
|
||||
links = extract_movie_links(driver)
|
||||
unique_links = [link for link in links if link not in seen_links]
|
||||
duplicates = len(links) - len(unique_links)
|
||||
|
||||
if unique_links:
|
||||
save_links(unique_links, search_counter, "search")
|
||||
save_seen_links(unique_links)
|
||||
print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
|
||||
search_counter += 1
|
||||
found_total += len(unique_links)
|
||||
else:
|
||||
print(f"\u26a0\ufe0f No new links found for query '{query}'.")
|
||||
|
||||
print("\u23f3 Waiting 10 seconds before next search...")
|
||||
time.sleep(10)
|
||||
|
||||
print(f"\n\ud83d\udcca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
|
||||
|
||||
def run_genre_mode(driver, seen_links, genre_batch):
|
||||
found_total = 0
|
||||
for genre_url in GENRE_URLS:
|
||||
print(f"\n\U0001f3ac Loading genre: {genre_url}")
|
||||
driver.get(genre_url)
|
||||
time.sleep(4)
|
||||
scroll_to_bottom(driver)
|
||||
links = extract_movie_links(driver)
|
||||
unique_links = [link for link in links if link not in seen_links]
|
||||
duplicates = len(links) - len(unique_links)
|
||||
|
||||
if unique_links:
|
||||
save_links(unique_links, genre_batch, "genre")
|
||||
save_seen_links(unique_links)
|
||||
print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
|
||||
genre_batch += 1
|
||||
found_total += len(unique_links)
|
||||
else:
|
||||
print("\u26a0\ufe0f No new unique links in this genre.")
|
||||
time.sleep(5)
|
||||
|
||||
print("📊 Genre scan complete. Found total", found_total, "new movie links.")
|
||||
|
||||
def main():
|
||||
print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
|
||||
driver = setup_browser()
|
||||
|
||||
try:
|
||||
print("\U0001f310 Opening Max.com homepage...")
|
||||
driver.get("https://play.max.com")
|
||||
time.sleep(3)
|
||||
|
||||
print("\n\U0001f511 Please log in manually in the browser.")
|
||||
input("\u23f3 After you're logged in, press ENTER here to continue...")
|
||||
|
||||
seen_links = load_seen_links()
|
||||
movie_batch = 1
|
||||
search_batch = 1
|
||||
genre_batch = 1
|
||||
|
||||
while True:
|
||||
print("📋 === MAIN MENU ===")
|
||||
print("1️⃣ Extract movies from /movies page")
|
||||
print("2️⃣ Extract random movies via search")
|
||||
print("3️⃣ Extract movies by genre")
|
||||
print("4️⃣ Exit")
|
||||
choice = input("Enter your choice: ").strip()
|
||||
|
||||
if choice == "1":
|
||||
run_movies_mode(driver, seen_links, movie_batch)
|
||||
movie_batch += 1
|
||||
elif choice == "2":
|
||||
run_search_mode(driver, seen_links, search_batch)
|
||||
search_batch += 1
|
||||
elif choice == "3":
|
||||
run_genre_mode(driver, seen_links, genre_batch)
|
||||
genre_batch += 1
|
||||
elif choice == "4":
|
||||
break
|
||||
else:
|
||||
print("\u274c Invalid choice.")
|
||||
|
||||
finally:
|
||||
again = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
|
||||
if again != "yes":
|
||||
main()
|
||||
else:
|
||||
input("✅ Press ENTER to exit.")
|
||||
driver.quit()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user